diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py
index ed5e318a..d2086c57 100644
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@@ -1,4 +1,5 @@
 import logging
+import re
 from typing import Dict, Any, List, Optional, Tuple, Union
 
 from modules.datamodels.datamodelChat import PromptPlaceholder
@@ -189,3 +190,69 @@ class AiService:
             prompt, documents, placeholders, options, outputFormat, title, documentProcessor, documentGenerator
         )
+
+    def sanitizePromptContent(self, content: str, contentType: str = "text") -> str:
+        """
+        Centralized prompt content sanitization to prevent injection attacks and ensure safe presentation.
+
+        This is the single source of truth for all prompt sanitization across the system.
+        Replaces all scattered sanitization functions with a unified approach.
+
+        Args:
+            content: The content to sanitize
+            contentType: Type of content ("text", "userinput", "json", "document")
+
+        Returns:
+            Safely sanitized content ready for AI prompt insertion
+        """
+        if not content:
+            return ""
+
+        try:
+            # Convert to string if not already
+            content_str = str(content)
+
+            # Remove null bytes and control characters (except newlines and tabs)
+            sanitized = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', content_str)
+
+            # Handle different content types with appropriate sanitization
+            if contentType == "userinput":
+                # Extra security for user-controlled content
+                # Escape curly braces to prevent placeholder injection
+                sanitized = sanitized.replace('{', '{{').replace('}', '}}')
+                # Escape quotes and wrap in single quotes
+                sanitized = sanitized.replace('"', '\\"').replace("'", "\\'")
+                return f"'{sanitized}'"
+
+            elif contentType == "json":
+                # For JSON content, escape quotes and backslashes
+                sanitized = sanitized.replace('\\', '\\\\')
+                sanitized = sanitized.replace('"', '\\"')
+                sanitized = sanitized.replace('\n', '\\n')
+                sanitized = sanitized.replace('\r', '\\r')
+                sanitized = sanitized.replace('\t', '\\t')
+
+            elif contentType == "document":
+                # For document content, escape special characters
+                sanitized = sanitized.replace('\\', '\\\\')
+                sanitized = sanitized.replace('"', '\\"')
+                sanitized = sanitized.replace("'", "\\'")
+                sanitized = sanitized.replace('\n', '\\n')
+                sanitized = sanitized.replace('\r', '\\r')
+                sanitized = sanitized.replace('\t', '\\t')
+
+            else:  # contentType == "text" or default
+                # Basic text sanitization
+                sanitized = sanitized.replace('\\', '\\\\')
+                sanitized = sanitized.replace('"', '\\"')
+                sanitized = sanitized.replace("'", "\\'")
+                sanitized = sanitized.replace('\n', '\\n')
+                sanitized = sanitized.replace('\r', '\\r')
+                sanitized = sanitized.replace('\t', '\\t')
+
+            return sanitized
+
+        except Exception as e:
+            logger.error(f"Error sanitizing prompt content: {str(e)}")
+            # Return a safe fallback
+            return "[ERROR: Content could not be safely sanitized]"
diff --git a/modules/services/serviceAi/subCoreAi.py b/modules/services/serviceAi/subCoreAi.py
index 7d1b849b..84ef012f 100644
--- a/modules/services/serviceAi/subCoreAi.py
+++ b/modules/services/serviceAi/subCoreAi.py
@@ -75,38 +75,105 @@ class SubCoreAi:
         else:
             full_prompt = prompt
 
-        # Timestamp-only prompt debug writing removed
+        # Check for unresolved placeholders and clean them up
+        try:
+            import re
+            # Find only {{KEY:...}} patterns that need to be removed
+            unresolved_placeholders = re.findall(r'\{\{KEY:[^}]+\}\}', full_prompt)
+            if unresolved_placeholders:
+                logger.warning(f"Found unresolved KEY 
placeholders in prompt: {unresolved_placeholders}") + # Remove only {{KEY:...}} patterns, leave other {{...}} content intact + full_prompt = re.sub(r'\{\{KEY:[^}]+\}\}', '', full_prompt) + # Clean up extra whitespace + full_prompt = re.sub(r'\n\s*\n\s*\n', '\n\n', full_prompt) + full_prompt = full_prompt.strip() + logger.info("Cleaned up unresolved KEY placeholders from prompt") + except Exception as e: + logger.warning(f"Error cleaning up prompt placeholders: {str(e)}") + + # Log the final integrated prompt that AI will receive + try: + from modules.shared.debugLogger import writeDebugFile + # Determine the prompt type based on operation type + if options.operationType == OperationType.GENERATE_PLAN: + prompt_type = "taskplanPrompt" + elif options.operationType == OperationType.ANALYSE_CONTENT: + prompt_type = "analysisPrompt" + else: + prompt_type = "aiPrompt" + + writeDebugFile(full_prompt, prompt_type, documents) + except Exception: + pass # Don't fail on debug logging except Exception: pass - # Handle document generation with specific output format + # Handle document generation with specific output format using unified approach if outputFormat and documentGenerator: - result = await documentGenerator.callAiWithDocumentGeneration(prompt, documents, options, outputFormat, title) - # Log AI response for debugging + # Use unified generation method for all document generation + if documents and len(documents) > 0: + # Extract content from documents first + logger.info(f"Extracting content from {len(documents)} documents") + extracted_content = await documentProcessor.callAiText(full_prompt, documents, options) + # Generate with extracted content + generated_json = await self._callAiUnifiedGeneration(full_prompt, extracted_content, options, outputFormat, title) + else: + # Direct generation without documents + logger.info("No documents provided - using direct generation") + generated_json = await self._callAiUnifiedGeneration(full_prompt, None, options, outputFormat, title) + + # Parse the generated JSON try: - if isinstance(result, dict) and 'content' in result: - self._writeAiResponseDebug( - label='ai_document_generation', - content=result['content'], - partIndex=1, - modelName=None, # Document generation doesn't return model info - continuation=False - ) - except Exception: - pass - return result + import json + generated_data = json.loads(generated_json) + except json.JSONDecodeError as e: + logger.error(f"Failed to parse generated JSON: {str(e)}") + return {"success": False, "error": f"Generated content is not valid JSON: {str(e)}"} + + # Render to final format using the existing renderer + try: + from modules.services.serviceGeneration.mainServiceGeneration import GenerationService + generationService = GenerationService(self.services) + rendered_content, mime_type = await generationService.renderReport( + generated_data, outputFormat, title or "Generated Document", full_prompt, self + ) + + # Build result in the expected format + result = { + "success": True, + "content": generated_data, + "documents": [{ + "documentName": f"generated.{outputFormat}", + "documentData": rendered_content, + "mimeType": mime_type, + "title": title or "Generated Document" + }], + "is_multi_file": False, + "format": outputFormat, + "title": title, + "split_strategy": "single", + "total_documents": 1, + "processed_documents": 1 + } + + # Log AI response for debugging + try: + from modules.shared.debugLogger import writeDebugFile + writeDebugFile(str(result), "documentGenerationResponse", documents) + 
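+                # str(result) above serializes the whole result dict, including the
+                # rendered documentData payloads, so these debug files can get large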
except Exception: + pass + return result + + except Exception as e: + logger.error(f"Error rendering document: {str(e)}") + return {"success": False, "error": f"Rendering failed: {str(e)}"} if call_type == "planning": result = await self._callAiPlanning(prompt, placeholders_dict, placeholders_meta, options) # Log AI response for debugging try: - self._writeAiResponseDebug( - label='ai_planning', - content=result or "", - partIndex=1, - modelName=None, # Planning doesn't return model info - continuation=False - ) + from modules.shared.debugLogger import writeDebugFile + writeDebugFile(str(result or ""), "taskplanResponse", documents) except Exception: pass return result @@ -125,31 +192,13 @@ class SubCoreAi: if documentProcessor and documents: result = await documentProcessor.callAiText(full_prompt, documents, options) else: - # Fallback to direct AI call if no document processor available - request = AiCallRequest( - prompt=full_prompt, - context="", - options=options - ) - response = await self.aiObjects.call(request) - result = response.content - - # Emit stats for direct AI call - self.services.workflow.storeWorkflowStat( - self.services.currentWorkflow, - response, - f"ai.call.{options.operationType}" - ) + # Enhanced direct AI call with partial results support + result = await self._callAiWithPartialResults(full_prompt, options) # Log AI response for debugging (additional logging for text calls) try: - self._writeAiResponseDebug( - label='ai_text_main', - content=result or "", - partIndex=1, - modelName=None, # Text calls already log internally - continuation=False - ) + from modules.shared.debugLogger import writeDebugFile + writeDebugFile(str(result or ""), "aiTextResponse", documents) except Exception: pass return result @@ -349,6 +398,253 @@ class SubCoreAi: pass return response.content + async def _callAiWithPartialResults( + self, + prompt: str, + options: AiCallOptions + ) -> str: + """ + Call AI with partial results continuation logic for direct calls. + Handles cases where AI needs to generate large responses in chunks. + """ + logger.info("Starting direct AI call with partial results support") + + # Build enhanced prompt with continuation instructions + enhanced_prompt = self._buildDirectContinuationPrompt(prompt) + + # Process with continuation logic + return await self._processDirectWithContinuationLoop(enhanced_prompt, options) + + def _buildDirectContinuationPrompt(self, base_prompt: str) -> str: + """ + Build a prompt for direct AI calls that includes partial results instructions. + """ + continuation_instructions = """ + +IMPORTANT: If your response is too large to generate completely in one response, you can deliver partial results and continue. 
+
+CONTINUATION LOGIC:
+- If you cannot complete the full response, end your response with:
+  [CONTINUE: brief description of what still needs to be generated]
+- The system will call you again to continue from where you left off
+- Continue generating from the exact point where you stopped
+- Maintain consistency with your previous partial response
+- Only stop when you have generated the complete response
+
+Examples:
+
+Example - Code Generation:
+If generating a large code file and you can only generate part of it:
+- Generate the first part (imports, classes, functions)
+- End with: [CONTINUE: Generate the remaining methods and main execution code]
+- In the next call, continue from where you left off
+
+Example - Documentation:
+If writing comprehensive documentation and you can only generate sections 1-3:
+- Generate sections 1-3 with full content
+- End with: [CONTINUE: Generate sections 4-8 covering advanced topics and examples]
+- In the next call, continue with sections 4-8
+
+This allows you to handle very large responses that exceed normal limits.
+"""
+
+        return f"{base_prompt}{continuation_instructions}"
+
+    async def _processDirectWithContinuationLoop(
+        self,
+        enhanced_prompt: str,
+        options: AiCallOptions
+    ) -> str:
+        """
+        Process direct AI call with continuation loop until complete.
+        """
+        max_iterations = 10  # Prevent infinite loops
+        iteration = 0
+        accumulated_content = []
+        continuation_hint = None
+
+        while iteration < max_iterations:
+            iteration += 1
+            logger.info(f"Direct AI continuation iteration {iteration}/{max_iterations}")
+
+            # Build prompt for this iteration
+            if continuation_hint:
+                iteration_prompt = self._buildDirectContinuationIterationPrompt(
+                    enhanced_prompt, continuation_hint, accumulated_content
+                )
+            else:
+                iteration_prompt = enhanced_prompt
+
+            # Make AI call for this iteration
+            try:
+                request = AiCallRequest(
+                    prompt=iteration_prompt,
+                    context="",
+                    options=options
+                )
+                response = await self.aiObjects.call(request)
+                result = response.content
+
+                # Emit stats for this iteration
+                self.services.workflow.storeWorkflowStat(
+                    self.services.currentWorkflow,
+                    response,
+                    f"ai.call.{options.operationType}.iteration_{iteration}"
+                )
+
+                if not result or not result.strip():
+                    logger.warning(f"Iteration {iteration}: Empty response, stopping")
+                    break
+
+                # Check for continuation marker
+                if "[CONTINUE:" in result:
+                    # Extract the continuation hint
+                    import re
+                    continue_match = re.search(r'\[CONTINUE:\s*([^\]]+)\]', result)
+                    if continue_match:
+                        continuation_hint = continue_match.group(1).strip()
+                        # Remove the continuation marker from the result
+                        result = re.sub(r'\s*\[CONTINUE:[^\]]+\]', '', result).strip()
+                    else:
+                        continuation_hint = "Continue from where you left off"
+
+                    # Add this partial result to accumulated content
+                    if result.strip():
+                        accumulated_content.append(result.strip())
+
+                    logger.info(f"Iteration {iteration}: Partial result added, continue hint: {continuation_hint}")
+                else:
+                    # No continuation marker - this is the final result
+                    if result.strip():
+                        accumulated_content.append(result.strip())
+
+                    logger.info(f"Direct AI continuation complete after {iteration} iterations")
+                    break
+
+            except Exception as e:
+                logger.error(f"Direct AI iteration {iteration} failed: {str(e)}")
+                break
+
+        if iteration >= max_iterations:
+            logger.warning(f"Direct AI continuation stopped after maximum iterations ({max_iterations})")
+
+        # For JSON responses, we need to merge them properly instead of concatenating
+        import json
+        if accumulated_content:
+            # Parse each part as JSON and merge them
+            merged_documents = []
+            merged_metadata = None
+
+            for content in accumulated_content:
+                try:
+                    parsed = json.loads(content)
+                except json.JSONDecodeError:
+                    # Plain-text parts (e.g. from non-JSON direct calls) cannot be
+                    # merged as documents, so return the concatenated text instead
+                    logger.warning("Continuation part is not valid JSON; returning concatenated text")
+                    return "\n\n".join(accumulated_content)
+                if isinstance(parsed, dict):
+                    # Extract metadata from first valid JSON
+                    if merged_metadata is None and "metadata" in parsed:
+                        merged_metadata = parsed["metadata"]
+
+                    # Extract documents from this part
+                    if "documents" in parsed and isinstance(parsed["documents"], list):
+                        merged_documents.extend(parsed["documents"])
+
+            # Create final merged JSON
+            final_result = json.dumps({
+                "metadata": merged_metadata or {
+                    "title": "Generated Document",
+                    "splitStrategy": "single_document",
+                    "source_documents": [],
+                    "extraction_method": "ai_generation"
+                },
+                "documents": merged_documents
+            }, indent=2)
+        else:
+            # Return empty JSON structure if no content
+            final_result = json.dumps({
+                "metadata": {
+                    "title": "Generated Document",
+                    "splitStrategy": "single_document",
+                    "source_documents": [],
+                    "extraction_method": "ai_generation"
+                },
+                "documents": []
+            }, indent=2)
+
+        logger.info(f"Final direct AI result: {len(accumulated_content)} parts from {iteration} iterations")
+        return final_result
+
+    def _buildDirectContinuationIterationPrompt(
+        self,
+        base_prompt: str,
+        continuation_hint: str,
+        accumulated_content: List[str]
+    ) -> str:
+        """
+        Build a prompt for continuation iteration with context.
+        """
+        # Build context of what's already been generated
+        context_summary = "PREVIOUSLY GENERATED CONTENT:\n"
+        for i, content in enumerate(accumulated_content[-2:]):  # Show last 2 parts for context
+            preview = content[:200] + "..." if len(content) > 200 else content
+            context_summary += f"Part {i+1}: {preview}\n"
+
+        continuation_prompt = f"""
+{base_prompt}
+
+{context_summary}
+
+CONTINUATION INSTRUCTIONS:
+- Continue from where you left off
+- Continuation hint: {continuation_hint}
+- Generate the next part of the content
+- Maintain consistency with previously generated content
+- End with [CONTINUE: description] if more content is needed
+- End without [CONTINUE] if the response is complete
+"""
+
+        return continuation_prompt
+
+    async def _callAiUnifiedGeneration(
+        self,
+        prompt: str,
+        extracted_content: Optional[str] = None,
+        options: Optional[AiCallOptions] = None,
+        outputFormat: str = "json",
+        title: str = "Generated Document"
+    ) -> str:
+        """
+        Unified generation method that handles both scenarios:
+        - With extracted content (from documents)
+        - Without extracted content (direct generation)
+
+        Always uses continuation logic for long responses.
+        Always returns standardized JSON format using the multi-document schema.
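+
+        Minimal usage sketch (illustrative; mirrors the call sites in this class):
+
+            generated_json = await self._callAiUnifiedGeneration(full_prompt, None, options, outputFormat, title)
+            data = json.loads(generated_json)  # {"metadata": {...}, "documents": [...]}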
+ """ + if options is None: + options = AiCallOptions() + + logger.info("Starting unified AI generation with continuation logic") + + # Use the existing buildGenerationPrompt to get the proper canonical format instructions + from modules.services.serviceGeneration.subPromptBuilder import buildGenerationPrompt + + # Build the generation prompt using the existing system + generation_prompt = await buildGenerationPrompt( + outputFormat=outputFormat, + userPrompt=prompt, + title=title, + aiService=self, + services=self.services + ) + + # If we have extracted content, prepend it to the prompt + if extracted_content: + generation_prompt = f"""EXTRACTED CONTENT FROM DOCUMENTS: +{extracted_content} + +{generation_prompt}""" + + # Use continuation logic for long responses + return await self._processDirectWithContinuationLoop(generation_prompt, options) + async def _callAiDirect( self, prompt: str, @@ -503,10 +799,6 @@ class SubCoreAi: return full_prompt - def _writeAiResponseDebug(self, label: str, content: Any, partIndex: int = 1, modelName: str = None, continuation: bool = None) -> None: - """Disabled verbose debug writing; only minimal files elsewhere.""" - return - def _exceedsTokenLimit(self, text: str, model: ModelCapabilities, safety_margin: float) -> bool: """ Check if text exceeds model token limit with safety margin. diff --git a/modules/services/serviceAi/subDocumentGeneration.py b/modules/services/serviceAi/subDocumentGeneration.py index 3d3e1c91..a4eded1d 100644 --- a/modules/services/serviceAi/subDocumentGeneration.py +++ b/modules/services/serviceAi/subDocumentGeneration.py @@ -30,8 +30,8 @@ class SubDocumentGeneration: title: Optional[str] ) -> Dict[str, Any]: """ - Handle AI calls with document generation in specific output format. - Now supports both single-file and multi-file generation. + Unified document generation method that handles both single and multi-file cases. + Always uses multi-file approach internally. Args: prompt: The main prompt for the AI call @@ -41,599 +41,396 @@ class SubDocumentGeneration: title: Optional title for generated documents Returns: - Dict with generated documents and metadata + Dict with generated documents and metadata in unified structure """ try: - # Use AI to analyze prompt intent - prompt_analysis = await self._analyzePromptIntent(prompt, self) - logger.info(f"Prompt analysis result: {prompt_analysis}") + # 1. Analyze prompt intent + promptAnalysis = await self._analyzePromptIntent(prompt, self) + logger.info(f"Prompt analysis result: {promptAnalysis}") - if prompt_analysis.get("is_multi_file", False): - return await self._callAiWithMultiFileGeneration( - prompt, documents, options, outputFormat, title, prompt_analysis - ) - else: - return await self._callAiWithSingleFileGeneration( - prompt, documents, options, outputFormat, title - ) + # 2. Get unified extraction prompt + from modules.services.serviceGeneration.mainServiceGeneration import GenerationService + generationService = GenerationService(self.services) + + extractionPrompt = await generationService.getAdaptiveExtractionPrompt( + outputFormat=outputFormat, + userPrompt=prompt, + title=title, + promptAnalysis=promptAnalysis, + aiService=self + ) + + # 3. Process with unified pipeline (always multi-file approach) + aiResponse = await self._processDocumentsUnified( + documents, extractionPrompt, options + ) + + # 4. 
Return unified result structure + return await self._buildUnifiedResult(aiResponse, outputFormat, title, promptAnalysis) except Exception as e: - logger.error(f"Error in document generation: {str(e)}") - return { - "success": False, - "error": str(e), - "content": "", - "rendered_content": "", - "mime_type": "text/plain", - "filename": f"error_{outputFormat}", - "format": outputFormat, - "title": title or "Error", - "documents": [] - } + logger.error(f"Error in unified document generation: {str(e)}") + return self._buildErrorResult(str(e), outputFormat, title) - async def _callAiWithSingleFileGeneration( + async def _processDocumentsUnified( self, - prompt: str, documents: Optional[List[ChatDocument]], - options: AiCallOptions, - outputFormat: str, - title: Optional[str], - generationPrompt: Optional[str] = None + extractionPrompt: str, + options: AiCallOptions ) -> Dict[str, Any]: - """Handle single-file document generation (existing functionality).""" + """ + Unified document processing that handles both single and multi-file cases. + Always processes as multi-file structure internally. + """ import time # Create progress logger workflow = self.services.currentWorkflow progressLogger = self.services.workflow.createProgressLogger(workflow) - operationId = f"docGenSingle_{workflow.id}_{int(time.time())}" + operationId = f"docGenUnified_{workflow.id}_{int(time.time())}" try: # Start progress tracking progressLogger.startOperation( operationId, "Generate", - "Single-file Generation", + "Unified Document Generation", f"Processing {len(documents) if documents else 0} documents" ) - # Get format-specific extraction prompt from generation service - from modules.services.serviceGeneration.mainServiceGeneration import GenerationService - generation_service = GenerationService(self.services) - - # Use default title if not provided - if not title: - title = "AI Generated Document" - # Update progress - generating extraction prompt progressLogger.updateProgress(operationId, 0.1, "Generating prompt") - # Get format-specific extraction prompt - extractionPrompt = await generation_service.getExtractionPrompt( - outputFormat=outputFormat, - userPrompt=prompt, - title=title, - aiService=self + # Process with unified JSON pipeline using continuation logic + aiResponse = await self.documentProcessor.processDocumentsWithContinuation( + documents, extractionPrompt, options ) - # Update progress - starting AI processing - progressLogger.updateProgress(operationId, 0.3, "AI processing") - - # Process documents with format-specific prompt using JSON mode with chunking - # This ensures structured JSON output instead of text and handles large documents - aiResponseJson = await self._callAiJsonWithChunking(extractionPrompt, documents, options, progressLogger, operationId) - # Update progress - AI processing completed progressLogger.updateProgress(operationId, 0.6, "Processing done") - # Validate JSON response - if not isinstance(aiResponseJson, dict) or "sections" not in aiResponseJson: - raise Exception("AI response is not valid JSON document structure") + # Log the AI response for debugging + logger.info(f"AI response received for validation:") + logger.info(f" - Type: {type(aiResponse)}") + logger.info(f" - Keys: {list(aiResponse.keys()) if isinstance(aiResponse, dict) else 'Not a dict'}") + logger.info(f" - Content: {aiResponse}") - # Emit raw extracted data as a chat message attachment before rendering + # Validate response structure + if not self._validateUnifiedResponseStructure(aiResponse): + raise 
Exception("AI response is not valid unified document structure") + + # Emit raw extracted data as a chat message attachment try: - await self._postRawDataChatMessage(aiResponseJson, label="raw_extraction_single") + await self._postRawDataChatMessage(aiResponse, label="raw_extraction_unified") except Exception: - logger.warning("Failed to emit raw extraction chat message (single-file)") + logger.warning("Failed to emit raw extraction chat message (unified)") + + # Complete progress tracking + progressLogger.completeOperation(operationId, True) + + return aiResponse + + except Exception as e: + logger.error(f"Error in unified document processing: {str(e)}") + progressLogger.completeOperation(operationId, False) + raise - # Generate filename from document metadata - parsedFilename = None - try: - if aiResponseJson.get("metadata", {}).get("title"): - title = aiResponseJson["metadata"]["title"] - # Clean title for filename - import re - parsed = re.sub(r"[^a-zA-Z0-9._-]", "-", title) - parsed = re.sub(r"-+", "-", parsed).strip('-') - if parsed: - parsedFilename = f"{parsed}.{outputFormat}" - except Exception: - parsedFilename = None + def _validateUnifiedResponseStructure(self, response: Dict[str, Any]) -> bool: + """ + Unified validation that checks for document structure. + Handles both multi-file (documents array) and single-file (sections array) structures. + """ + try: + if not isinstance(response, dict): + logger.warning(f"Response validation failed: Response is not a dict, got {type(response)}") + return False + + # Check for documents array (multi-file structure) + hasDocuments = "documents" in response + isDocumentsList = isinstance(response.get("documents"), list) + + # Check for sections array (single-file structure) + hasSections = "sections" in response + isSectionsList = isinstance(response.get("sections"), list) + + if hasDocuments and isDocumentsList: + # Multi-file structure + documents = response.get("documents", []) + if not documents: + logger.warning("Unified validation failed: documents array is empty") + return False + + # Validate each document individually + validDocuments = 0 + for i, doc in enumerate(documents): + if self._validateDocumentStructure(doc, i): + validDocuments += 1 + else: + logger.warning(f"Document {i} failed validation, but continuing with others") + + # Process succeeds if at least one document is valid + if validDocuments == 0: + logger.error("Unified validation failed: no valid documents found") + return False + + logger.info(f"Unified validation passed: {validDocuments}/{len(documents)} documents valid") + return True + + elif hasSections and isSectionsList: + # Single-file structure - convert to multi-file format + logger.info("Converting single-file structure to multi-file format") + sections = response.get("sections", []) + if not sections: + logger.warning("Unified validation failed: sections array is empty") + return False + + # Convert to documents array format + response["documents"] = [{ + "id": "document_1", + "title": response.get("metadata", {}).get("title", "Generated Document"), + "filename": "document_1", + "sections": sections + }] + + logger.info("Successfully converted single-file structure to multi-file format") + return True + + else: + # No valid structure found - fail with clear error details + logger.error("Unified validation failed: No valid structure found") + logger.error(f"Response type: {type(response)}") + logger.error(f"Available keys: {list(response.keys()) if isinstance(response, dict) else 'Not a dict'}") + 
logger.error(f"hasDocuments={hasDocuments}, isDocumentsList={isDocumentsList}") + logger.error(f"hasSections={hasSections}, isSectionsList={isSectionsList}") + logger.error(f"Full response: {response}") + return False + + except Exception as e: + logger.warning(f"Unified response validation failed with exception: {str(e)}") + return False + + def _validateDocumentStructure(self, document: Dict[str, Any], documentIndex: int) -> bool: + """ + Validate individual document structure. + Returns True if document is valid, False otherwise. + Does not fail the entire process if one document is invalid. + """ + try: + if not isinstance(document, dict): + logger.error(f"Document {documentIndex} validation failed: not a dict, got {type(document)}") + logger.error(f"Document {documentIndex} content: {document}") + return False + + # Check for required fields + hasTitle = "title" in document + hasSections = "sections" in document + isSectionsList = isinstance(document.get("sections"), list) + + logger.debug(f"Document {documentIndex} structure check:") + logger.debug(f" - hasTitle: {hasTitle}") + logger.debug(f" - hasSections: {hasSections}") + logger.debug(f" - isSectionsList: {isSectionsList}") + logger.debug(f" - available keys: {list(document.keys())}") + + if not (hasTitle and hasSections and isSectionsList): + logger.error(f"Document {documentIndex} validation failed:") + logger.error(f" - title present: {hasTitle}") + logger.error(f" - sections present: {hasSections}") + logger.error(f" - sections is list: {isSectionsList}") + logger.error(f" - document content: {document}") + return False + + sections = document.get("sections", []) + if not sections: + logger.error(f"Document {documentIndex} validation failed: sections array is empty") + logger.error(f" - document content: {document}") + return False + + logger.info(f"Document {documentIndex} validation passed") + return True + + except Exception as e: + logger.error(f"Document {documentIndex} validation failed with exception: {str(e)}") + logger.error(f" - document content: {document}") + return False + + async def _buildUnifiedResult( + self, + aiResponse: Dict[str, Any], + outputFormat: str, + title: str, + promptAnalysis: Dict[str, Any] + ) -> Dict[str, Any]: + """ + Build unified result structure that always returns array-based format. + Content is always a multi-document structure. 
+ """ + try: + # Process all documents uniformly + generatedDocuments = [] + documents = aiResponse.get("documents", []) + + for i, docData in enumerate(documents): + try: + processedDocument = await self._processDocument( + docData, outputFormat, title, i + ) + generatedDocuments.append(processedDocument) + except Exception as e: + logger.warning(f"Failed to process document {i}: {str(e)}, skipping") + continue + + if not generatedDocuments: + raise Exception("No documents could be processed successfully") + + # Build unified result + result = { + "success": True, + "content": aiResponse, # Always multi-document structure + "documents": generatedDocuments, # Always array + "is_multi_file": len(generatedDocuments) > 1, + "format": outputFormat, + "title": title, + "split_strategy": promptAnalysis.get("strategy", "single"), + "total_documents": len(generatedDocuments), + "processed_documents": len(generatedDocuments) + } + + return result + + except Exception as e: + logger.error(f"Error building unified result: {str(e)}") + return self._buildErrorResult(str(e), outputFormat, title) + + async def _processDocument( + self, + docData: Dict[str, Any], + outputFormat: str, + title: str, + documentIndex: int + ) -> Dict[str, Any]: + """ + Process individual document with content enhancement and rendering. + """ + try: + # Get generation service + from modules.services.serviceGeneration.mainServiceGeneration import GenerationService + generationService = GenerationService(self.services) # Use AI generation to enhance the extracted JSON before rendering - enhancedContent = aiResponseJson # Default to original - if prompt: + enhancedContent = docData # Default to original + if docData.get("sections"): try: - from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType - # Get generation prompt - generationPrompt = await generation_service.getGenerationPrompt( + generationPrompt = await generationService.getGenerationPrompt( outputFormat=outputFormat, - userPrompt=prompt, - title=title, + userPrompt=title, + title=docData.get("title", title), aiService=self ) # Prepare the AI call - request_options = AiCallOptions() - request_options.operationType = OperationType.GENERAL + from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType + requestOptions = AiCallOptions() + requestOptions.operationType = OperationType.GENERAL # Create context with the extracted JSON content import json - context = f"Extracted JSON content:\n{json.dumps(aiResponseJson, indent=2)}" + context = f"Extracted JSON content:\n{json.dumps(docData, indent=2)}" request = AiCallRequest( prompt=generationPrompt, context=context, - options=request_options + options=requestOptions ) # Call AI to enhance the content response = await self.aiObjects.call(request) - # Save generation prompt and response to debug - try: - from modules.shared.debugLogger import writeDebugFile - debugData = { - "output_format": outputFormat, - "title": title, - "context_length": len(context), - "extracted_content_keys": list(aiResponseJson.keys()) if isinstance(aiResponseJson, dict) else [] - } - writeDebugFile(generationPrompt, "generation_single", debugData) - writeDebugFile(response.content or '', "generation_single_response") - except Exception: - pass - if response and response.content: # Parse the AI response as JSON try: import re result = response.content.strip() - # Check if result is empty after stripping - if not result: - logger.warning("AI generation returned empty content after stripping, using 
original content") - enhancedContent = aiResponseJson - else: - # Extract JSON from markdown if present - json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL) - if json_match: - result = json_match.group(1).strip() - elif result.startswith('```json'): - result = re.sub(r'^```json\s*', '', result) - result = re.sub(r'\s*```$', '', result) - elif result.startswith('```'): - result = re.sub(r'^```\s*', '', result) - result = re.sub(r'\s*```$', '', result) - - # Check if result is still empty after markdown extraction - if not result: - logger.warning("AI generation returned empty content after markdown extraction, using original content") - enhancedContent = aiResponseJson - else: - # Try to parse JSON with better error handling - try: - enhancedContent = json.loads(result) - logger.info(f"AI enhanced JSON content successfully") - except json.JSONDecodeError as jsonError: - # Try to fix common JSON issues - fixed_result = self._attemptJsonFix(result) - if fixed_result != result: - try: - enhancedContent = json.loads(fixed_result) - logger.info(f"AI enhanced JSON content successfully after fixing") - except json.JSONDecodeError: - logger.warning(f"AI generation returned invalid JSON even after fixing: {str(jsonError)}, using original content") - enhancedContent = aiResponseJson - else: - logger.warning(f"AI generation returned invalid JSON: {str(jsonError)}, using original content") - enhancedContent = aiResponseJson + # Extract JSON from markdown if present + jsonMatch = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL) + if jsonMatch: + result = jsonMatch.group(1).strip() + elif result.startswith('```json'): + result = re.sub(r'^```json\s*', '', result) + result = re.sub(r'\s*```$', '', result) + elif result.startswith('```'): + result = re.sub(r'^```\s*', '', result) + result = re.sub(r'\s*```$', '', result) + + # Try to parse JSON + enhancedContent = json.loads(result) + logger.info(f"AI enhanced JSON content successfully for document {documentIndex}") except json.JSONDecodeError as e: - logger.warning(f"AI generation returned invalid JSON: {str(e)}, using original content") - enhancedContent = aiResponseJson + logger.warning(f"AI generation returned invalid JSON for document {documentIndex}: {str(e)}, using original content") + enhancedContent = docData else: - logger.warning("AI generation returned empty response, using original content") - enhancedContent = aiResponseJson + logger.warning(f"AI generation returned empty response for document {documentIndex}, using original content") + enhancedContent = docData except Exception as e: - logger.warning(f"AI generation failed: {str(e)}, using original content") - enhancedContent = aiResponseJson + logger.warning(f"AI generation failed for document {documentIndex}: {str(e)}, using original content") + enhancedContent = docData # Render the enhanced JSON content - renderedContent, mimeType = await generation_service.renderReport( + renderedContent, mimeType = await generationService.renderReport( extractedContent=enhancedContent, outputFormat=outputFormat, - title=title, - userPrompt=prompt, + title=docData.get("title", title), + userPrompt=title, aiService=self ) - # Generate meaningful filename (use AI-provided if valid, else fallback) - from datetime import datetime, UTC - timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") - if parsedFilename and parsedFilename.lower().endswith(f".{outputFormat.lower()}"): - filename = parsedFilename + # Generate proper filename + baseFilename = docData.get("filename", 
f"document_{documentIndex + 1}") + if '.' in baseFilename: + baseFilename = baseFilename.rsplit('.', 1)[0] + + # Add proper extension based on output format + if outputFormat.lower() == "docx": + filename = f"{baseFilename}.docx" + elif outputFormat.lower() == "pdf": + filename = f"{baseFilename}.pdf" + elif outputFormat.lower() == "html": + filename = f"{baseFilename}.html" else: - safeTitle = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-') - filename = f"{safeTitle or 'document'}-{timestamp}.{outputFormat}" + filename = f"{baseFilename}.{outputFormat}" - # Update progress - generation completed - progressLogger.updateProgress(operationId, 0.9, "Rendering") - - result = { - "success": True, - "content": aiResponseJson, # Structured JSON document - "rendered_content": renderedContent, # Formatted content - "mime_type": mimeType, - "filename": filename, - "format": outputFormat, - "title": title, - "documents": [{ - "documentName": filename, - "documentData": renderedContent, - "mimeType": mimeType - }], - "is_multi_file": False + return { + "documentName": filename, + "documentData": renderedContent, + "mimeType": mimeType, + "title": docData.get("title", title), + "documentIndex": documentIndex } - - # Complete progress tracking - progressLogger.completeOperation(operationId, True) - - return result - + except Exception as e: - logger.error(f"Error in single-file document generation: {str(e)}") - # Complete progress tracking with failure - progressLogger.completeOperation(operationId, False) + logger.error(f"Error processing document {documentIndex}: {str(e)}") raise - async def _callAiWithMultiFileGeneration( - self, - prompt: str, - documents: Optional[List[ChatDocument]], - options: AiCallOptions, - outputFormat: str, - title: Optional[str], - prompt_analysis: Dict[str, Any] - ) -> Dict[str, Any]: - """Handle multi-file document generation using AI analysis.""" - import time - - # Create progress logger - workflow = self.services.currentWorkflow - progressLogger = self.services.workflow.createProgressLogger(workflow) - operationId = f"docGen_{workflow.id}_{int(time.time())}" - - try: - # Start progress tracking - progressLogger.startOperation( - operationId, - "Generate", - "Multi-file Generation", - f"Processing {len(documents) if documents else 0} documents" - ) - - # Get multi-file extraction prompt based on AI analysis - from modules.services.serviceGeneration.mainServiceGeneration import GenerationService - generation_service = GenerationService(self.services) - - # Use default title if not provided - if not title: - title = "AI Generated Documents" - - # Update progress - generating extraction prompt - progressLogger.updateProgress(operationId, 0.1, "Generating prompt") - - # Get adaptive extraction prompt - extraction_prompt = await generation_service.getAdaptiveExtractionPrompt( - outputFormat=outputFormat, - userPrompt=prompt, - title=title, - promptAnalysis=prompt_analysis, - aiService=self - ) - - logger.info(f"Adaptive extraction prompt length: {len(extraction_prompt)} characters") - logger.debug(f"Adaptive extraction prompt preview: {extraction_prompt[:500]}...") - - # Update progress - starting document processing - progressLogger.updateProgress(operationId, 0.2, "Processing docs") - - # Process with adaptive JSON schema - use the existing pipeline but with adaptive prompt - logger.info(f"Using adaptive prompt with existing pipeline: {len(extraction_prompt)} chars") - logger.debug(f"Processing documents: {len(documents) if documents else 0} 
documents") - - # Use the existing pipeline but replace the prompt with our adaptive one - # This ensures proper document processing while using the multi-file prompt - ai_response = await self.documentProcessor.processDocumentsPerChunkJsonWithPrompt(documents, extraction_prompt, options) - - logger.info(f"AI response type: {type(ai_response)}") - logger.info(f"AI response keys: {list(ai_response.keys()) if isinstance(ai_response, dict) else 'Not a dict'}") - logger.debug(f"AI response preview: {str(ai_response)[:500]}...") - - # Validate response structure - if not self._validateResponseStructure(ai_response, prompt_analysis): - # Fallback to single-file if multi-file fails - logger.warning(f"Multi-file processing failed - Invalid response structure. Expected multi-file but got: {list(ai_response.keys()) if isinstance(ai_response, dict) else type(ai_response)}") - logger.warning(f"Prompt analysis: {prompt_analysis}") - logger.warning("Falling back to single-file generation") - return await self._callAiWithSingleFileGeneration( - prompt, documents, options, outputFormat, title - ) - - # Emit raw extracted data as a chat message attachment before transformation/rendering - try: - await self._postRawDataChatMessage(ai_response, label="raw_extraction_multi") - except Exception: - logger.warning("Failed to emit raw extraction chat message (multi-file)") - - # Process multiple documents - generated_documents = [] - for i, doc_data in enumerate(ai_response.get("documents", [])): - # Transform AI-generated sections to renderer-compatible format - transformed_sections = [] - for section in doc_data.get("sections", []): - # Convert AI format to renderer format - transformed_section = { - "id": section.get("id", f"section_{len(transformed_sections) + 1}"), - "content_type": section.get("content_type", "paragraph"), - "elements": section.get("elements", []), - "order": section.get("order", len(transformed_sections) + 1) - } - - # Extract text from elements for simple text-based sections - if section.get("content_type") in ["paragraph", "heading"]: - text_parts = [] - for element in section.get("elements", []): - if "text" in element: - text_parts.append(element["text"]) - # Add text to the first element or create a new one - if transformed_section["elements"]: - transformed_section["elements"][0]["text"] = "\n".join(text_parts) - else: - transformed_section["elements"] = [{"text": "\n".join(text_parts)}] - - transformed_sections.append(transformed_section) - - # Create complete document structure for rendering - complete_document = { - "metadata": { - "title": doc_data["title"], - "source_document": "multi_file_generation", - "document_id": doc_data.get("id", f"doc_{i+1}"), - "filename": doc_data.get("filename", f"document_{i+1}"), - "split_strategy": prompt_analysis.get("strategy", "custom") - }, - "sections": transformed_sections, - "summary": f"Generated document: {doc_data['title']}", - "tags": ["multi_file", "ai_generated"] - } - - # Use AI generation to enhance the extracted JSON before rendering - enhancedContent = complete_document # Default to original - if prompt: - try: - from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType - - # Get generation prompt - generationPrompt = await generation_service.getGenerationPrompt( - outputFormat=outputFormat, - userPrompt=prompt, - title=doc_data["title"], - aiService=self - ) - - # Prepare the AI call - request_options = AiCallOptions() - request_options.operationType = OperationType.GENERAL - - # Create context with 
the extracted JSON content - import json - context = f"Extracted JSON content:\n{json.dumps(complete_document, indent=2)}" - - request = AiCallRequest( - prompt=generationPrompt, - context=context, - options=request_options - ) - - # Call AI to enhance the content - response = await self.aiObjects.call(request) - - # Save generation prompt and response to debug - try: - from modules.shared.debugLogger import writeDebugFile - debugData = { - "output_format": outputFormat, - "title": doc_data["title"], - "document_index": i, - "context_length": len(context), - "extracted_content_keys": list(complete_document.keys()) if isinstance(complete_document, dict) else [] - } - writeDebugFile(generationPrompt, f"generation_multi_doc_{i}", debugData) - writeDebugFile(response.content or '', f"generation_multi_doc_{i}_response") - except Exception: - pass - - if response and response.content: - # Parse the AI response as JSON - try: - import re - result = response.content.strip() - - # Extract JSON from markdown if present - json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL) - if json_match: - result = json_match.group(1).strip() - elif result.startswith('```json'): - result = re.sub(r'^```json\s*', '', result) - result = re.sub(r'\s*```$', '', result) - elif result.startswith('```'): - result = re.sub(r'^```\s*', '', result) - result = re.sub(r'\s*```$', '', result) - - # Try to parse JSON - enhancedContent = json.loads(result) - logger.info(f"AI enhanced JSON content successfully") - - except json.JSONDecodeError as e: - logger.warning(f"AI generation returned invalid JSON: {str(e)}, attempting to repair...") - # Try to repair common JSON issues - try: - repaired_result = self._repairJson(result) - enhancedContent = json.loads(repaired_result) - logger.info(f"Successfully repaired JSON content") - except (json.JSONDecodeError, Exception) as repair_error: - logger.warning(f"JSON repair failed: {str(repair_error)}, trying AI repair...") - # Try AI-powered JSON repair as last resort - try: - ai_repaired = await self._repairJsonWithAI(result) - enhancedContent = json.loads(ai_repaired) - logger.info(f"AI successfully repaired JSON content") - except Exception as ai_repair_error: - logger.warning(f"AI JSON repair also failed: {str(ai_repair_error)}, using original content") - enhancedContent = complete_document - else: - logger.warning("AI generation returned empty response, using original content") - enhancedContent = complete_document - - except Exception as e: - logger.warning(f"AI generation failed: {str(e)}, using original content") - enhancedContent = complete_document - - # Render the enhanced JSON content - rendered_content, mime_type = await generation_service.renderReport( - extractedContent=enhancedContent, - outputFormat=outputFormat, - title=doc_data["title"], - userPrompt=prompt, - aiService=self - ) - - # Generate proper filename with correct extension - base_filename = doc_data.get("filename", f"document_{i+1}") - # Remove any existing extension and add the correct one - if '.' 
in base_filename: - base_filename = base_filename.rsplit('.', 1)[0] - - # Add proper extension based on output format - if outputFormat.lower() == "docx": - filename = f"{base_filename}.docx" - elif outputFormat.lower() == "pdf": - filename = f"{base_filename}.pdf" - elif outputFormat.lower() == "html": - filename = f"{base_filename}.html" - else: - filename = f"{base_filename}.{outputFormat}" - - generated_documents.append({ - "documentName": filename, - "documentData": rendered_content, - "mimeType": mime_type - }) - - # Save debug files for multi-file generation - only if debug enabled - debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) - if debug_enabled: - try: - import os - from datetime import datetime, UTC - ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") - debug_root = "./test-chat/ai" - debug_dir = os.path.join(debug_root, f"multifile_output_{ts}") - os.makedirs(debug_dir, exist_ok=True) - - # Save metadata - with open(os.path.join(debug_dir, "metadata.txt"), "w", encoding="utf-8") as f: - f.write(f"title: {title}\n") - f.write(f"format: {outputFormat}\n") - f.write(f"documents_count: {len(generated_documents)}\n") - f.write(f"split_strategy: {prompt_analysis.get('strategy', 'custom')}\n") - f.write(f"prompt_analysis: {prompt_analysis}\n") - - # Save each generated document - for i, doc in enumerate(generated_documents): - doc_filename = doc["documentName"] - doc_data = doc["documentData"] - doc_mime = doc["mimeType"] - - # Determine file extension - if outputFormat.lower() == "docx": - file_ext = ".docx" - elif outputFormat.lower() == "pdf": - file_ext = ".pdf" - elif outputFormat.lower() == "html": - file_ext = ".html" - else: - file_ext = f".{outputFormat}" - - # Save the rendered document - output_path = os.path.join(debug_dir, f"document_{i+1}_{doc_filename}") - - if file_ext in ['.md', '.txt', '.html', '.json', '.csv']: - # Text-based formats - with open(output_path, 'w', encoding='utf-8') as f: - f.write(doc_data) - else: - # Binary formats - decode from base64 if needed - try: - import base64 - doc_bytes = base64.b64decode(doc_data) - with open(output_path, 'wb') as f: - f.write(doc_bytes) - except Exception: - # If not base64, save as text - with open(output_path, 'w', encoding='utf-8') as f: - f.write(doc_data) - - logger.info(f"šŸ’¾ Debug: Saved multi-file document {i+1}: {output_path}") - - logger.info(f"šŸ’¾ Debug: Multi-file output saved to: {debug_dir}") - - except Exception as e: - logger.warning(f"Failed to save multi-file debug output: {e}") - - # Update progress - generation completed - progressLogger.updateProgress(operationId, 0.9, "Rendering") - - result = { - "success": True, - "content": ai_response, - "rendered_content": None, # Not applicable for multi-file - "mime_type": None, # Not applicable for multi-file - "filename": None, # Not applicable for multi-file - "format": outputFormat, - "title": title, - "documents": generated_documents, - "is_multi_file": True, - "split_strategy": prompt_analysis.get("strategy", "custom") - } - - # Complete progress tracking - progressLogger.completeOperation(operationId, True) - - return result - - except Exception as e: - logger.error(f"Error in multi-file document generation: {str(e)}") - # Complete progress tracking with failure - progressLogger.completeOperation(operationId, False) - # Fallback to single-file - return await self._callAiWithSingleFileGeneration( - prompt, documents, options, outputFormat, title - ) + def _buildErrorResult(self, errorMessage: str, outputFormat: 
str, title: str) -> Dict[str, Any]: + """ + Build error result with unified structure. + """ + return { + "success": False, + "error": errorMessage, + "content": {}, + "documents": [], + "is_multi_file": False, + "format": outputFormat, + "title": title, + "split_strategy": "error", + "total_documents": 0, + "processed_documents": 0 + } async def _callAiJson( self, @@ -648,90 +445,6 @@ class SubDocumentGeneration: # Process documents with JSON merging return await self.documentProcessor.processDocumentsPerChunkJson(documents, prompt, options) - async def _callAiJsonWithChunking( - self, - prompt: str, - documents: Optional[List[ChatDocument]], - options: AiCallOptions, - progressLogger, - operationId: str - ) -> Dict[str, Any]: - """ - Handle AI calls with document processing for JSON output using chunking for large documents. - Supports continuation when documents are too large for single AI call. - """ - # Initialize the document structure - completeDocument = { - "metadata": {"title": "Generated Document"}, - "sections": [], - "continue": True - } - - continuationContext = None - chunkCount = 0 - maxChunks = 10 # Prevent infinite loops - - while completeDocument.get("continue", False) and chunkCount < maxChunks: - chunkCount += 1 - logger.info(f"Processing generation chunk {chunkCount}") - - # Update progress - progressLogger.updateProgress(operationId, 0.3 + (chunkCount * 0.3 / maxChunks), f"Generating chunk {chunkCount}") - - # Prepare the chunk prompt - if continuationContext: - chunkPrompt = f""" -{prompt} - -CONTINUATION CONTEXT: -- Last completed section: {continuationContext.get('last_section_id', 'none')} -- Last completed element index: {continuationContext.get('last_element_index', 0)} -- Remaining requirements: {continuationContext.get('remaining_requirements', 'complete the document')} - -Continue generating the document from where you left off. Include all previously generated content and add the remaining sections. 
-""" - else: - chunkPrompt = prompt - - # Call AI for this chunk using the existing document processor - aiResponseJson = await self.documentProcessor.processDocumentsPerChunkJson(documents, chunkPrompt, options) - - # Validate JSON response - if not isinstance(aiResponseJson, dict): - raise Exception("AI response is not valid JSON document structure") - - # Merge the chunk with the complete document - if chunkCount == 1: - # First chunk - use as base - completeDocument = aiResponseJson - else: - # Subsequent chunks - merge sections - if "sections" in aiResponseJson: - # Find the last section ID from continuation context - lastSectionId = continuationContext.get('last_section_id', '') if continuationContext else '' - - # Add new sections after the last completed one - newSections = [] - for section in aiResponseJson["sections"]: - if section.get("id") != lastSectionId: - newSections.append(section) - - completeDocument["sections"].extend(newSections) - - # Check if we need to continue - if aiResponseJson.get("continue", False): - continuationContext = aiResponseJson.get("continuation_context", {}) - logger.info(f"Document generation needs continuation: {continuationContext}") - else: - completeDocument["continue"] = False - logger.info("Document generation completed") - - if chunkCount >= maxChunks: - logger.warning(f"Document generation stopped after {maxChunks} chunks (max limit reached)") - completeDocument["continue"] = False - - return completeDocument - async def _analyzePromptIntent(self, prompt: str, ai_service=None) -> Dict[str, Any]: """Use AI to analyze user prompt and determine processing requirements.""" if not ai_service: @@ -741,7 +454,7 @@ Continue generating the document from where you left off. Include all previously analysis_prompt = f""" Analyze this user request and determine if it requires multiple file output or single file output. -User request: "{prompt}" +User request: "{self.services.ai.sanitizePromptContent(prompt, 'userinput')}" Respond with JSON only in this exact format: {{ @@ -787,33 +500,6 @@ Return only the JSON response. 
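+        # Illustration of the sanitized embedding above (sanitizePromptContent with
+        # contentType='userinput'): braces are doubled, quotes are escaped, and the
+        # value is wrapped in single quotes, e.g.
+        #   'Split {this} report'  ->  "'Split {{this}} report'"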
logger.warning(f"AI prompt analysis failed: {str(e)}, defaulting to single file") return {"is_multi_file": False, "strategy": "single", "criteria": None} - def _validateResponseStructure(self, response: Dict[str, Any], prompt_analysis: Dict[str, Any]) -> bool: - """Validate that AI response matches the expected structure.""" - try: - if not isinstance(response, dict): - logger.warning(f"Response validation failed: Response is not a dict, got {type(response)}") - return False - - # Check for multi-file structure - if prompt_analysis.get("is_multi_file", False): - has_documents = "documents" in response - is_documents_list = isinstance(response.get("documents"), list) - logger.info(f"Multi-file validation: has_documents={has_documents}, is_documents_list={is_documents_list}") - if has_documents and is_documents_list: - logger.info(f"Multi-file validation passed: {len(response['documents'])} documents found") - else: - logger.warning(f"Multi-file validation failed: documents key present={has_documents}, documents is list={is_documents_list}") - logger.warning(f"Available keys: {list(response.keys())}") - return has_documents and is_documents_list - else: - has_sections = "sections" in response - is_sections_list = isinstance(response.get("sections"), list) - logger.info(f"Single-file validation: has_sections={has_sections}, is_sections_list={is_sections_list}") - return has_sections and is_sections_list - except Exception as e: - logger.warning(f"Response validation failed with exception: {str(e)}") - return False - async def _postRawDataChatMessage(self, payload: Any, label: str = "raw_extraction") -> None: """ Create a ChatMessage with the extracted raw JSON attached as a file so the user @@ -865,153 +551,4 @@ Return only the JSON response. services.workflow.storeMessageWithDocuments(services.workflow.workflow, messageData, [doc]) except Exception: # Non-fatal; ignore if storage or chat creation fails - return - - def _repairJson(self, json_string: str) -> str: - """Repair common JSON syntax errors efficiently for large JSON.""" - try: - import re - import json - - # Remove any leading/trailing whitespace - json_string = json_string.strip() - - # For large JSON, skip substring extraction and go straight to targeted repairs - logger.info(f"Attempting JSON repair for {len(json_string)} characters...") - - # Try to parse first to see what specific error we get - try: - json.loads(json_string) - return json_string # Already valid - except json.JSONDecodeError as e: - error_msg = str(e) - logger.info(f"JSON error: {error_msg}") - - # Apply targeted fixes based on the specific error - if "Expecting ',' delimiter" in error_msg: - # Fix missing commas between array elements - json_string = re.sub(r'\]\s*\[', '], [', json_string) - json_string = re.sub(r'\}\s*\{', '}, {', json_string) - # Fix missing commas between object properties - json_string = re.sub(r'("\s*:\s*[^,}]+)\s*(")', r'\1, \2', json_string) - - if "Expecting value" in error_msg: - # Fix missing values (replace empty with null) - json_string = re.sub(r':\s*,', ': null,', json_string) - json_string = re.sub(r':\s*}', ': null}', json_string) - - if "Expecting property name" in error_msg: - # Fix unquoted property names - json_string = re.sub(r'(\w+):', r'"\1":', json_string) - - # Fix trailing commas before closing brackets/braces - json_string = re.sub(r',(\s*[}\]])', r'\1', json_string) - - # Fix missing closing brackets/braces (only if reasonable) - open_braces = json_string.count('{') - close_braces = json_string.count('}') - 
open_brackets = json_string.count('[') - close_brackets = json_string.count(']') - - # Only add missing brackets if the difference is small (avoid runaway) - if 0 < (open_braces - close_braces) <= 5: - missing_braces = open_braces - close_braces - json_string += '}' * missing_braces - - if 0 < (open_brackets - close_brackets) <= 5: - missing_brackets = open_brackets - close_brackets - json_string += ']' * missing_brackets - - # Try to parse again - try: - json.loads(json_string) - logger.info("JSON repair successful") - return json_string - except json.JSONDecodeError: - logger.warning("JSON repair failed - will try AI repair") - return json_string - - except Exception as e: - logger.warning(f"JSON repair failed: {str(e)}") - return json_string - - async def _repairJsonWithAI(self, malformed_json: str) -> str: - """Use AI to repair malformed JSON efficiently for large files.""" - try: - # Limit JSON size for AI processing (max 50KB to avoid token limits) - max_json_size = 50000 - json_to_repair = malformed_json - - if len(malformed_json) > max_json_size: - logger.warning(f"JSON too large ({len(malformed_json)} chars), truncating to {max_json_size} chars for AI repair") - # Try to find a good truncation point (end of a complete object/array) - truncate_at = max_json_size - for i in range(max_json_size, max(0, max_json_size - 1000), -1): - if malformed_json[i] in ['}', ']']: - truncate_at = i + 1 - break - json_to_repair = malformed_json[:truncate_at] + "..." - - repair_prompt = f""" -You are a JSON repair expert. Fix the following malformed JSON and return ONLY the corrected JSON, no explanations. - -Malformed JSON: -{json_to_repair} - -Return only the valid JSON: -""" - - # Use AI to repair the JSON - repaired_json = await self.services.ai.callAi( - prompt=repair_prompt, - documents=None, - options={ - "process_type": "text", - "operation_type": "generate_content", - "priority": "speed", - "max_cost": 0.01 - } - ) - - # Clean up the response (remove any markdown formatting) - repaired_json = repaired_json.strip() - if repaired_json.startswith('```json'): - repaired_json = repaired_json[7:] - if repaired_json.endswith('```'): - repaired_json = repaired_json[:-3] - repaired_json = repaired_json.strip() - - # Validate the repaired JSON - import json - json.loads(repaired_json) - logger.info("AI JSON repair successful") - return repaired_json - - except Exception as e: - logger.warning(f"AI JSON repair failed: {str(e)}") - return malformed_json - - def _attemptJsonFix(self, json_string: str) -> str: - """Attempt to fix common JSON issues""" - try: - # Remove any trailing commas before closing braces/brackets - import re - fixed = re.sub(r',(\s*[}\]])', r'\1', json_string) - - # Try to fix unterminated strings by adding quotes at the end - if '"' in fixed and not fixed.strip().endswith('"'): - # Count quotes to see if we have an odd number (unterminated string) - quote_count = fixed.count('"') - if quote_count % 2 == 1: - # Find the last quote and add a closing quote - last_quote_pos = fixed.rfind('"') - if last_quote_pos != -1: - # Check if there's content after the last quote that needs to be quoted - after_quote = fixed[last_quote_pos + 1:].strip() - if after_quote and not after_quote.startswith(','): - # Add closing quote before any trailing content - fixed = fixed[:last_quote_pos + 1] + '"' + after_quote - - return fixed - except Exception: - return json_string + return \ No newline at end of file diff --git a/modules/services/serviceAi/subDocumentProcessing.py 
b/modules/services/serviceAi/subDocumentProcessing.py index 44dcc41c..9757350a 100644 --- a/modules/services/serviceAi/subDocumentProcessing.py +++ b/modules/services/serviceAi/subDocumentProcessing.py @@ -107,7 +107,7 @@ class SubDocumentProcessing: # Save merged extraction content to debug try: from modules.shared.debugLogger import writeDebugFile - writeDebugFile(mergedContent or '', "extraction_merged") + writeDebugFile(mergedContent or '', "extractionMergedText") except Exception: pass @@ -202,7 +202,7 @@ class SubDocumentProcessing: from modules.shared.debugLogger import writeDebugFile import json as _json jsonStr = _json.dumps(mergedJsonDocument, ensure_ascii=False, indent=2) - writeDebugFile(jsonStr, "extraction_merged_json", mergedJsonDocument) + writeDebugFile(jsonStr, "extractionMergedJson") except Exception: pass @@ -225,6 +225,7 @@ class SubDocumentProcessing: """ Process documents with per-chunk AI calls and merge results in JSON mode. Uses a custom prompt instead of the default extraction prompt. + Enhanced with partial results continuation logic. """ if not documents: return {"metadata": {"title": "Empty Document"}, "sections": []} @@ -305,6 +306,199 @@ class SubDocumentProcessing: logger.error(f"Error in per-chunk JSON processing: {str(e)}") return {"metadata": {"title": "Error Document"}, "sections": []} + async def processDocumentsWithContinuation( + self, + documents: List[ChatDocument], + custom_prompt: str, + options: Optional[AiCallOptions] = None + ) -> Dict[str, Any]: + """ + Process documents with partial results continuation logic. + Handles AI responses that indicate partial completion and loops until complete. + """ + if not documents: + return {"metadata": {"title": "Empty Document"}, "sections": []} + + logger.info("Starting document processing with continuation logic") + + # Build enhanced prompt with continuation instructions + enhanced_prompt = self._buildContinuationPrompt(custom_prompt) + + # Process with continuation logic + return await self._processWithContinuationLoop(documents, enhanced_prompt, options) + + def _buildContinuationPrompt(self, base_prompt: str) -> str: + """ + Build a prompt that includes partial results continuation instructions. + """ + continuation_instructions = """ + +IMPORTANT CHUNKING LOGIC: +- If the response is too large to generate completely in one response, set "continue": true +- When "continue": true, include a "continuation_context" field with: + - "last_section_id": "id of the last completed section" + - "last_element_index": "index of the last completed element in that section" + - "remaining_requirements": "brief description of what still needs to be generated" +- The AI will be called again with this context to continue generation +- Only set "continue": false when the response is completely generated + +OUTPUT FORMAT: Return only valid JSON in this exact structure: +{ + "metadata": { + "title": "Document Title" + }, + "sections": [ + { + "id": "section_1", + "content_type": "paragraph", + "elements": [ + { + "text": "This is the actual content that should be generated." + } + ], + "order": 1 + } + ], + "continue": false, + "continuation_context": { + "last_section_id": "section_1", + "last_element_index": 0, + "remaining_requirements": "description of what still needs to be generated" + } +} + +The AI should generate content using the canonical format with "sections" and "elements". 
+""" + + return f"{base_prompt}{continuation_instructions}" + + async def _processWithContinuationLoop( + self, + documents: List[ChatDocument], + enhanced_prompt: str, + options: Optional[AiCallOptions] = None + ) -> Dict[str, Any]: + """ + Process documents with continuation loop until complete. + """ + max_iterations = 10 # Prevent infinite loops + iteration = 0 + accumulated_sections = [] + continuation_context = None + + while iteration < max_iterations: + iteration += 1 + logger.info(f"Continuation iteration {iteration}/{max_iterations}") + + # Build prompt for this iteration + if continuation_context: + iteration_prompt = self._buildContinuationIterationPrompt( + enhanced_prompt, continuation_context, accumulated_sections + ) + else: + iteration_prompt = enhanced_prompt + + # Process documents for this iteration + try: + # Use the existing processing method + result = await self.processDocumentsPerChunkJsonWithPrompt( + documents, iteration_prompt, options + ) + + # Check if this is a valid JSON response + if not isinstance(result, dict): + logger.warning(f"Iteration {iteration}: Invalid result type, stopping") + break + + # Extract sections from result + sections = result.get("sections", []) + if not sections: + logger.warning(f"Iteration {iteration}: No sections found, stopping") + break + + # Add sections to accumulated results + for section in sections: + # Update section order to maintain sequence + section["order"] = len(accumulated_sections) + 1 + accumulated_sections.append(section) + + # Check if continuation is needed + continue_flag = result.get("continue", False) + continuation_context = result.get("continuation_context") + + logger.info(f"Iteration {iteration}: Added {len(sections)} sections, continue={continue_flag}") + + if not continue_flag: + logger.info(f"Continuation complete after {iteration} iterations") + break + + if not continuation_context: + logger.warning(f"Iteration {iteration}: continue=true but no continuation_context, stopping") + break + + except Exception as e: + logger.error(f"Iteration {iteration} failed: {str(e)}") + break + + if iteration >= max_iterations: + logger.warning(f"Continuation stopped after maximum iterations ({max_iterations})") + + # Build final result + final_result = { + "metadata": { + "title": "Generated Document", + "total_sections": len(accumulated_sections), + "iterations": iteration, + "continuation_used": iteration > 1 + }, + "sections": accumulated_sections, + "continue": False + } + + logger.info(f"Final result: {len(accumulated_sections)} sections from {iteration} iterations") + return final_result + + def _buildContinuationIterationPrompt( + self, + base_prompt: str, + continuation_context: Dict[str, Any], + accumulated_sections: List[Dict[str, Any]] + ) -> str: + """ + Build a prompt for continuation iteration with context. + """ + last_section_id = continuation_context.get("last_section_id", "") + last_element_index = continuation_context.get("last_element_index", 0) + remaining_requirements = continuation_context.get("remaining_requirements", "") + + # Build context of what's already been generated + context_summary = "PREVIOUSLY GENERATED CONTENT:\n" + for i, section in enumerate(accumulated_sections[-3:]): # Show last 3 sections for context + context_summary += f"Section {i+1}: {section.get('id', 'unknown')}\n" + if 'elements' in section and section['elements']: + first_element = section['elements'][0] + if 'text' in first_element: + preview = first_element['text'][:100] + "..." 
if len(first_element['text']) > 100 else first_element['text']
+                    context_summary += f"  Preview: {preview}\n"
+        
+        continuation_prompt = f"""
+{base_prompt}
+
+{context_summary}
+
+CONTINUATION INSTRUCTIONS:
+- Continue from where you left off
+- Last completed section: {last_section_id}
+- Last completed element index: {last_element_index}
+- Remaining requirements: {remaining_requirements}
+- Generate the next part of the content
+- Maintain consistency with previously generated content
+- Use the same JSON format as before
+- Set "continue": true if more content is needed, false if complete
+"""
+        
+        return continuation_prompt
+
     async def callAiText(
         self,
         prompt: str,
@@ -522,14 +716,8 @@ class SubDocumentProcessing:
             # Save extraction prompt and response to debug
             try:
                 from modules.shared.debugLogger import writeDebugFile
-                debugData = {
-                    "chunk_index": chunk_index,
-                    "mime_type": part.mimeType,
-                    "type_group": part.typeGroup,
-                    "context_length": len(part.data) if part.data else 0
-                }
-                writeDebugFile(augmented_prompt, f"extraction_chunk_{chunk_index}", debugData)
-                writeDebugFile(ai_result or '', f"extraction_chunk_{chunk_index}_response")
+                writeDebugFile(augmented_prompt, f"extractionChunk{chunk_index}-Prompt")
+                writeDebugFile(ai_result or '', f"extractionChunk{chunk_index}-Response")
             except Exception:
                 pass
@@ -629,14 +817,8 @@ class SubDocumentProcessing:
             # Save extraction prompt and response to debug
             try:
                 from modules.shared.debugLogger import writeDebugFile
-                debugData = {
-                    "chunk_index": chunk_index,
-                    "mime_type": part.mimeType,
-                    "type_group": part.typeGroup,
-                    "context_length": len(part.data) if part.data else 0
-                }
-                writeDebugFile(augmented_prompt_text, f"extraction_chunk_{chunk_index}", debugData)
-                writeDebugFile(ai_result or '', f"extraction_chunk_{chunk_index}_response")
+                writeDebugFile(augmented_prompt_text, f"extractionChunk{chunk_index}-Prompt")
+                writeDebugFile(ai_result or '', f"extractionChunk{chunk_index}-Response")
             except Exception:
                 pass
 
diff --git a/modules/services/serviceGeneration/mainServiceGeneration.py b/modules/services/serviceGeneration/mainServiceGeneration.py
index dcc30a7f..f2dc89a0 100644
--- a/modules/services/serviceGeneration/mainServiceGeneration.py
+++ b/modules/services/serviceGeneration/mainServiceGeneration.py
@@ -372,59 +372,6 @@ class GenerationService:
             services=self.services
         )
 
-    async def getGenericExtractionPrompt(
-        self,
-        outputFormat: str,
-        userPrompt: str,
-        title: str,
-        aiService=None
-    ) -> str:
-        """Get generic extraction prompt that works for both single and multi-file."""
-        from .subPromptBuilder import buildGenericExtractionPrompt
-        return await buildGenericExtractionPrompt(
-            outputFormat=outputFormat,
-            userPrompt=userPrompt,
-            title=title,
-            aiService=aiService,
-            services=self.services
-        )
-
-    async def getExtractionPrompt(self, outputFormat: str, userPrompt: str, title: str, aiService=None) -> str:
-        """
-        Get the format-specific extraction prompt for AI content extraction.
- - Args: - outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx) - userPrompt: User's original prompt for report generation - title: Report title - aiService: AI service instance for intent extraction - - Returns: - str: Format-specific prompt for AI extraction - """ - try: - # Get the appropriate renderer for the format - renderer = self._getFormatRenderer(outputFormat) - if not renderer: - raise ValueError(f"Unsupported output format: {outputFormat}") - - # Build centralized prompt with generic rules + format-specific guidelines - from .subPromptBuilder import buildExtractionPrompt - extractionPrompt = await buildExtractionPrompt( - outputFormat=outputFormat, - renderer=renderer, - userPrompt=userPrompt, - title=title, - aiService=aiService, - services=self.services - ) - - logger.info(f"Generated {outputFormat}-specific extraction prompt: {len(extractionPrompt)} characters") - return extractionPrompt - - except Exception as e: - logger.error(f"Error getting extraction prompt for {outputFormat}: {str(e)}") - raise async def renderAdaptiveReport( self, diff --git a/modules/services/serviceGeneration/renderers/rendererBaseTemplate.py b/modules/services/serviceGeneration/renderers/rendererBaseTemplate.py index 24728df4..bee1b82f 100644 --- a/modules/services/serviceGeneration/renderers/rendererBaseTemplate.py +++ b/modules/services/serviceGeneration/renderers/rendererBaseTemplate.py @@ -344,12 +344,8 @@ class BaseRenderer(ABC): # Save styling prompt and response to debug try: from modules.shared.debugLogger import writeDebugFile - debugData = { - "template_length": len(style_template), - "default_styles_keys": list(default_styles.keys()) if isinstance(default_styles, dict) else [] - } - writeDebugFile(style_template, "renderer_styling", debugData) - writeDebugFile(response.content or '', "renderer_styling_response") + writeDebugFile(style_template, "rendererStylingPrompt") + writeDebugFile(response.content or '', "rendererStylingResponse") except Exception: pass diff --git a/modules/services/serviceGeneration/renderers/rendererImage.py b/modules/services/serviceGeneration/renderers/rendererImage.py index f54e8f82..dab0496c 100644 --- a/modules/services/serviceGeneration/renderers/rendererImage.py +++ b/modules/services/serviceGeneration/renderers/rendererImage.py @@ -62,12 +62,7 @@ class RendererImage(BaseRenderer): # Save image generation prompt to debug try: from modules.shared.debugLogger import writeDebugFile - debugData = { - "title": document_title, - "user_prompt_length": len(user_prompt) if user_prompt else 0, - "extracted_content_keys": list(extracted_content.keys()) if isinstance(extracted_content, dict) else [] - } - writeDebugFile(image_prompt, "renderer_image_generation", debugData) + writeDebugFile(image_prompt, "rendererImageGenerationPrompt") except Exception: pass @@ -82,12 +77,7 @@ class RendererImage(BaseRenderer): # Save image generation response to debug try: from modules.shared.debugLogger import writeDebugFile - responseData = { - "success": image_result.get("success", False) if image_result else False, - "has_image_data": bool(image_result.get("image_data", "")) if image_result else False, - "result_keys": list(image_result.keys()) if isinstance(image_result, dict) else [] - } - writeDebugFile(str(image_result), "renderer_image_generation_response", responseData) + writeDebugFile(str(image_result), "rendererImageGenerationResponse") except Exception: pass @@ -114,7 +104,7 @@ class RendererImage(BaseRenderer): # Add user's original intent if 
available if user_prompt: - prompt_parts.append(f"User Request: {user_prompt}") + prompt_parts.append(f"User Request: {ai_service.sanitizePromptContent(user_prompt, 'userinput')}") # Add document title prompt_parts.append(f"Document Title: {title}") @@ -151,7 +141,7 @@ class RendererImage(BaseRenderer): # Fallback to minimal prompt if AI compression fails or is still too long minimal_prompt = f"Create a professional image representing: {title}" if user_prompt: - minimal_prompt += f" - {user_prompt}" + minimal_prompt += f" - {ai_service.sanitizePromptContent(user_prompt, 'userinput')}" # If even the minimal prompt is too long, truncate it if len(minimal_prompt) > 4000: diff --git a/modules/services/serviceGeneration/subPromptBuilder.py b/modules/services/serviceGeneration/subPromptBuilder.py index 4a5a8e97..bfb4052f 100644 --- a/modules/services/serviceGeneration/subPromptBuilder.py +++ b/modules/services/serviceGeneration/subPromptBuilder.py @@ -81,64 +81,20 @@ async def buildAdaptiveExtractionPrompt( ] } - # Single-file example data instead of schema - single_file_example = { - "metadata": { - "title": "Single Document Example", - "source_documents": ["doc_001"], - "extraction_method": "ai_extraction" - }, - "sections": [ - { - "id": "section_1", - "content_type": "heading", - "elements": [ - { - "level": 1, - "text": "1. SECTION TITLE" - } - ], - "order": 1 - }, - { - "id": "section_2", - "content_type": "paragraph", - "elements": [ - { - "text": "This is the actual content that should be extracted from the document." - } - ], - "order": 2 - }, - { - "id": "section_3", - "content_type": "table", - "elements": [ - { - "headers": ["Column 1", "Column 2"], - "rows": [["Value 1", "Value 2"]] - } - ], - "order": 3 - } - ] - } - - if promptAnalysis.get("is_multi_file", False): - # Multi-file prompt - adaptive_prompt = f""" -{userPrompt} + # UNIFIED APPROACH: Always use multi-document format (single doc = multi with n=1) + adaptive_prompt = f""" +{services.ai.sanitizePromptContent(userPrompt, 'userinput')} You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output. -TASK: Extract the actual content from the document and organize it into separate sections, where each section will become a separate file. +TASK: Extract the actual content from the document and organize it into documents. For single documents, create one document entry. For multi-document requests, create multiple document entries. REQUIREMENTS: 1. Analyze the document content provided in the context below 2. Identify distinct sections in the document (by headings, topics, or logical breaks) -3. Create one JSON document entry for each section found +3. Create one or more JSON document entries based on the content structure 4. Extract the real content from each section (headings, paragraphs, lists, etc.) -5. Generate appropriate filenames for each section +5. Generate appropriate filenames for each document CRITICAL: You MUST return a JSON structure with a "documents" array, NOT a "sections" array. @@ -147,17 +103,18 @@ OUTPUT FORMAT: Return only valid JSON in this exact structure: IMPORTANT: The JSON must have a "documents" key containing an array of document objects. 
Each document object must have: - "id": unique identifier -- "title": section title from the document -- "filename": appropriate filename for the section +- "title": document title +- "filename": appropriate filename for the document - "sections": array of content sections DO NOT return a JSON with "sections" at the root level. Return a JSON with "documents" at the root level. INSTRUCTIONS: -- Replace "REPLACE_WITH_ACTUAL_*" placeholders with real content from the document +- For single document requests: Create one document with all content in its sections +- For multi-document requests: Create multiple documents, each with relevant sections - Use actual section titles, headings, and text from the document -- Create meaningful filenames based on section content -- Ensure each section contains the complete content for that part of the document +- Create meaningful filenames based on content +- Ensure each section contains the complete content for that part - Do not use generic placeholder text like "Section 1", "Section 2" - Extract real headings, paragraphs, lists, and other content elements - CRITICAL: Return JSON with "documents" array, not "sections" array @@ -181,58 +138,12 @@ Image Analysis Requirements: Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON. -Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents. -""".strip() - else: - # Single-file prompt - use example data instead of schema - adaptive_prompt = f""" -{userPrompt} - -You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output. - -TASK: Extract the actual content from the document and organize it into structured sections. - -REQUIREMENTS: -1. Analyze the document content provided in the context below -2. Extract all content and organize it into logical sections -3. Create structured JSON with sections containing the extracted content -4. Preserve the original structure and data - -OUTPUT FORMAT: Return only valid JSON in this exact structure: -{json.dumps(single_file_example, indent=2)} - -INSTRUCTIONS: -- Replace example data with actual content from the document -- Use actual headings, paragraphs, and text from the document -- Ensure all content is properly structured -- Do not use generic placeholder text -- Extract real content from the documents - -CONTEXT (Document Content): - -Content Types to Extract: -1. Tables: Extract all rows and columns with proper headers -2. Lists: Extract all items with proper nesting -3. Headings: Extract with appropriate levels -4. Paragraphs: Extract as structured text -5. Code: Extract code blocks with language identification -6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements - -Image Analysis Requirements: -- If you cannot analyze an image for any reason, explain why in the JSON response -- Describe everything you see in the image -- Include all text content, tables, logos, graphics, layout, and visual elements -- If the image is too small, corrupted, or unclear, explain this -- Always provide feedback - never return empty responses - -Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON. 
- Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents. """.strip() return adaptive_prompt -async def buildGenericExtractionPrompt( +async def buildGenerationPrompt( outputFormat: str, userPrompt: str, title: str, diff --git a/modules/shared/debugLogger.py b/modules/shared/debugLogger.py index fb8bde68..82dcb1c9 100644 --- a/modules/shared/debugLogger.py +++ b/modules/shared/debugLogger.py @@ -3,9 +3,8 @@ Simple debug logger for AI prompts and responses. Writes files chronologically to gateway/test-chat/ai/ with sequential numbering. """ import os -import json from datetime import datetime, UTC -from typing import Any, Optional +from typing import List, Optional def _getDebugDir() -> str: @@ -25,64 +24,16 @@ def _getNextSequenceNumber() -> int: return len(files) + 1 -def _formatJsonReadable(data: Any) -> str: - """ - Format JSON data in a readable line-by-line structure. - Handles both structured objects and text representations of dicts/lists. - - Args: - data: The data to format - - Returns: - Formatted string representation - """ - try: - # First try to parse if it's a string representation - if isinstance(data, str): - try: - # Try to parse as JSON first - parsed = json.loads(data) - data = parsed - except json.JSONDecodeError: - # Try to evaluate as Python literal (for dict/list strings) - try: - import ast - parsed = ast.literal_eval(data) - if isinstance(parsed, (dict, list)): - data = parsed - except (ValueError, SyntaxError): - # If all parsing fails, treat as plain text - pass - - # Convert to JSON string with proper indentation - if isinstance(data, (dict, list)): - jsonStr = json.dumps(data, ensure_ascii=False, default=str, indent=2) - else: - jsonStr = str(data) - - # Split into lines and add line numbers for better readability - lines = jsonStr.split('\n') - formattedLines = [] - - for i, line in enumerate(lines, 1): - # Add line number and proper spacing - lineNum = f"{i:3d}: " - formattedLines.append(f"{lineNum}{line}") - - return '\n'.join(formattedLines) - except Exception: - # Fallback to string representation if JSON formatting fails - return str(data) - - -def writeDebugFile(content: str, fileType: str, data: Optional[Any] = None) -> None: +def writeDebugFile(content: str, fileType: str, documents: Optional[List] = None) -> None: """ Write debug content to a file with sequential numbering. + Writes the content as-is since it's already the final integrated prompt. + Includes document list labels for tracing enhancement. 
Args: - content: The main content to write - fileType: Type of file (e.g., 'prompt', 'response', 'placeholders') - data: Optional additional data to include as JSON + content: The main content to write (already integrated) + fileType: Type of file (e.g., 'prompt_final', 'response') + documents: Optional list of documents for tracing """ try: debugDir = _getDebugDir() @@ -96,27 +47,23 @@ def writeDebugFile(content: str, fileType: str, data: Optional[Any] = None) -> N filename = f"{tsWithSeq}-{fileType}.txt" filepath = os.path.join(debugDir, filename) + # Build content with document tracing + debug_content = content + + # Add document list labels for tracing enhancement + if documents: + debug_content += "\n\n=== DOCUMENT LIST FOR TRACING ===\n" + for i, doc in enumerate(documents): + if hasattr(doc, 'fileName'): + debug_content += f"Document {i+1}: {doc.fileName} ({doc.mimeType})\n" + elif hasattr(doc, 'fileId'): + debug_content += f"Document {i+1}: {doc.fileId} ({getattr(doc, 'mimeType', 'unknown')})\n" + else: + debug_content += f"Document {i+1}: {str(doc)[:100]}...\n" + + # Write the content with document tracing with open(filepath, 'w', encoding='utf-8') as f: - f.write(content) - - # If structured data provided, also append a human-readable section to the main .txt - try: - if data is not None: - formatted = _formatJsonReadable(data) - with open(filepath, 'a', encoding='utf-8') as f: - f.write("\n\n=== FORMATTED DATA (human-readable) ===\n") - f.write(formatted) - f.write("\n") - except Exception: - pass - - # If additional data provided, write it as a separate JSON file with readable formatting - if data is not None: - jsonFilename = f"{tsWithSeq}-{fileType}_data.json" - jsonFilepath = os.path.join(debugDir, jsonFilename) - with open(jsonFilepath, 'w', encoding='utf-8') as f: - formattedData = _formatJsonReadable(data) - f.write(formattedData) + f.write(debug_content) except Exception as e: # Silent fail - don't break the main flow diff --git a/modules/workflows/processing/adaptive/contentValidator.py b/modules/workflows/processing/adaptive/contentValidator.py index 0aa634c3..91896373 100644 --- a/modules/workflows/processing/adaptive/contentValidator.py +++ b/modules/workflows/processing/adaptive/contentValidator.py @@ -125,10 +125,6 @@ DELIVERED CONTENT TO CHECK: documents=None, options=request_options ) - # Write validation prompt/response to debug - from modules.shared.debugLogger import writeDebugFile - writeDebugFile(validationPrompt, "validation_content_prompt") - writeDebugFile(response or '', "validation_content_response") # No retries or correction prompts here; parse-or-fail below diff --git a/modules/workflows/processing/adaptive/intentAnalyzer.py b/modules/workflows/processing/adaptive/intentAnalyzer.py index b1512e18..7d21b8d1 100644 --- a/modules/workflows/processing/adaptive/intentAnalyzer.py +++ b/modules/workflows/processing/adaptive/intentAnalyzer.py @@ -30,7 +30,7 @@ class IntentAnalyzer: analysisPrompt = f""" You are an intent analyzer. Analyze the user's request to understand what they want delivered. -USER REQUEST: {userPrompt} +USER REQUEST: {self.services.ai.sanitizePromptContent(userPrompt, 'userinput')} CONTEXT: {getattr(context.task_step, 'objective', '') if hasattr(context, 'task_step') and context.task_step else ''} @@ -62,17 +62,12 @@ CRITICAL: Respond with ONLY the JSON object below. 
Do not include any explanator from modules.datamodels.datamodelAi import AiCallOptions, OperationType request_options = AiCallOptions() request_options.operationType = OperationType.GENERAL - # Write prompt to debug - from modules.shared.debugLogger import writeDebugFile - writeDebugFile(analysisPrompt, "intent_prompt") response = await self.services.ai.callAi( prompt=analysisPrompt, documents=None, options=request_options ) - # Write response to debug - writeDebugFile(response or '', "intent_response") # No retries or correction prompts here; parse-or-fail below diff --git a/modules/workflows/processing/core/taskPlanner.py b/modules/workflows/processing/core/taskPlanner.py index 5feb1137..3bea9c76 100644 --- a/modules/workflows/processing/core/taskPlanner.py +++ b/modules/workflows/processing/core/taskPlanner.py @@ -94,10 +94,6 @@ class TaskPlanner: taskPlanningPromptTemplate = bundle.prompt placeholders = bundle.placeholders - # Write task planning prompt to debug - from modules.shared.debugLogger import writeDebugFile - writeDebugFile(taskPlanningPromptTemplate, "taskplan_prompt", placeholders) - # Centralized AI call: Task planning (quality, detailed) with placeholders options = AiCallOptions( operationType=OperationType.GENERATE_PLAN, @@ -118,9 +114,6 @@ class TaskPlanner: # Check if AI response is valid if not prompt: raise ValueError("AI service returned no response for task planning") - - # Write task planning response to debug - writeDebugFile(prompt or '', "taskplan_response") # Parse task plan response try: diff --git a/modules/workflows/processing/modes/modeReact.py b/modules/workflows/processing/modes/modeReact.py index 1fd8e8bb..cbe29eee 100644 --- a/modules/workflows/processing/modes/modeReact.py +++ b/modules/workflows/processing/modes/modeReact.py @@ -20,7 +20,6 @@ from modules.workflows.processing.shared.promptGenerationActionsReact import ( generateReactParametersPrompt, generateReactRefinementPrompt ) -from modules.shared.debugLogger import writeDebugFile from modules.workflows.processing.shared.placeholderFactory import extractReviewContent from modules.workflows.processing.adaptive import IntentAnalyzer, ContentValidator, LearningEngine, ProgressTracker from modules.workflows.processing.adaptive.adaptiveLearningEngine import AdaptiveLearningEngine @@ -191,10 +190,6 @@ class ReactMode(BaseMode): promptTemplate = bundle.prompt placeholders = bundle.placeholders - # Write action selection prompt to debug - from modules.shared.debugLogger import writeDebugFile - writeDebugFile(promptTemplate, "action_selection_prompt", placeholders) - # Centralized AI call for plan selection (use plan generation quality) options = AiCallOptions( operationType=OperationType.GENERATE_PLAN, @@ -211,8 +206,6 @@ class ReactMode(BaseMode): placeholders=placeholders, options=options ) - # Write action selection response to debug - writeDebugFile(response or '', "action_selection_response") jsonStart = response.find('{') if response else -1 jsonEnd = response.rfind('}') + 1 if response else 0 if jsonStart == -1 or jsonEnd == 0: @@ -306,9 +299,6 @@ class ReactMode(BaseMode): promptTemplate = bundle.prompt placeholders = bundle.placeholders - # Write parameters prompt to debug - writeDebugFile(promptTemplate, "parameters_prompt", placeholders) - # Centralized AI call for parameter suggestion (balanced analysis) options = AiCallOptions( operationType=OperationType.ANALYSE_CONTENT, @@ -367,12 +357,11 @@ class ReactMode(BaseMode): if 'language' not in parameters and hasattr(self.services, 'user') and 
getattr(self.services.user, 'language', None): parameters['language'] = self.services.user.language - # Write parameters response to debug + # Build merged parameters object mergedParamObj = { "schema": (paramObj.get('schema') if isinstance(paramObj, dict) else 'parameters_v1'), "parameters": parameters } - writeDebugFile(str(mergedParamObj), "parameters_response", mergedParamObj) # Build a synthetic ActionItem for execution routing and labels currentRound = getattr(self.workflow, 'currentRound', 0) @@ -625,9 +614,6 @@ class ReactMode(BaseMode): promptTemplate = bundle.prompt placeholders = bundle.placeholders - # Write refinement/validation prompt to debug - writeDebugFile(promptTemplate, "validation_refinement_prompt", placeholders) - # Centralized AI call for refinement decision (balanced analysis) options = AiCallOptions( operationType=OperationType.ANALYSE_CONTENT, @@ -644,8 +630,6 @@ class ReactMode(BaseMode): placeholders=placeholders, options=options ) - # Write refinement/validation response to debug - writeDebugFile(resp or '', "validation_refinement_response") # More robust JSON extraction if not resp: diff --git a/modules/workflows/processing/shared/promptGenerationActionsActionplan.py b/modules/workflows/processing/shared/promptGenerationActionsActionplan.py index c94179e6..fc44be85 100644 --- a/modules/workflows/processing/shared/promptGenerationActionsActionplan.py +++ b/modules/workflows/processing/shared/promptGenerationActionsActionplan.py @@ -36,6 +36,9 @@ def generateActionDefinitionPrompt(services, context: Any) -> PromptBundle: ## šŸ“‹ Context + ### User Language + {{KEY:USER_LANGUAGE}} + ### Task Objective {{KEY:USER_PROMPT}} @@ -44,10 +47,7 @@ def generateActionDefinitionPrompt(services, context: Any) -> PromptBundle: ### Available Connections {{KEY:AVAILABLE_CONNECTIONS_INDEX}} - - ### User Language - {{KEY:USER_LANGUAGE}} - + ### Workflow History {{KEY:WORKFLOW_HISTORY}} @@ -77,7 +77,7 @@ def generateActionDefinitionPrompt(services, context: Any) -> PromptBundle: "parameters": {}, "resultLabel": "round{current_round}_task{current_task}_action{action_number}_{descriptive_label}", "description": "What this action accomplishes", - "userMessage": "User-friendly message in {{KEY:USER_LANGUAGE}}" + "userMessage": "User-friendly message in language '{{KEY:USER_LANGUAGE}}'" } ] } @@ -118,7 +118,7 @@ def generateActionDefinitionPrompt(services, context: Any) -> PromptBundle: - **Make labels meaningful** for future reference ### User Messages - - **Write in user language** ({{KEY:USER_LANGUAGE}}) + - **Write in user language:** '{{KEY:USER_LANGUAGE}}' - **Explain what's happening** in user-friendly terms - **Keep messages concise** but informative @@ -171,7 +171,7 @@ def generateResultReviewPrompt(context: Any) -> PromptBundle: "met_criteria": ["criteria1", "criteria2"], "unmet_criteria": ["criteria3", "criteria4"], "confidence": 0.85, - "userMessage": "User-friendly message explaining the validation result" + "userMessage": "User-friendly message explaining the validation result in language '{{KEY:USER_LANGUAGE}}'" } ``` diff --git a/modules/workflows/processing/shared/promptGenerationActionsReact.py b/modules/workflows/processing/shared/promptGenerationActionsReact.py index 56f6aaa5..cec3c25d 100644 --- a/modules/workflows/processing/shared/promptGenerationActionsReact.py +++ b/modules/workflows/processing/shared/promptGenerationActionsReact.py @@ -24,6 +24,7 @@ def generateReactPlanSelectionPrompt(services, context: Any, learningEngine=None """Define placeholders first, 
then the template; return PromptBundle.""" placeholders: List[PromptPlaceholder] = [ PromptPlaceholder(label="USER_PROMPT", content=extractUserPrompt(context), summaryAllowed=False), + PromptPlaceholder(label="USER_LANGUAGE", content=extractUserLanguage(services), summaryAllowed=False), PromptPlaceholder(label="AVAILABLE_DOCUMENTS_SUMMARY", content=extractAvailableDocumentsSummary(services, context), summaryAllowed=True), PromptPlaceholder(label="AVAILABLE_METHODS", content=extractAvailableMethods(services), summaryAllowed=False), # Provide enriched history context for Stage 1 to craft parametersContext @@ -68,26 +69,20 @@ AVAILABLE_DOCUMENTS_INDEX: AVAILABLE_CONNECTIONS_INDEX: {{KEY:AVAILABLE_CONNECTIONS_INDEX}} -{{#if ADAPTIVE_GUIDANCE}} LEARNING-BASED GUIDANCE: {{KEY:ADAPTIVE_GUIDANCE}} -{{#if FAILURE_ANALYSIS}} FAILURE ANALYSIS: {{KEY:FAILURE_ANALYSIS}} -{{/if}} ESCALATION LEVEL: {{KEY:ESCALATION_LEVEL}} -{{/if}} REPLY: Return ONLY a JSON object with the following structure (no comments, no extra text). The chosen action MUST: - be the next logical incremental step toward fulfilling the objective - not attempt to complete the entire objective in one step - if producing files, target exactly one output format for this step - reference ONLY existing document IDs/labels from AVAILABLE_DOCUMENTS_INDEX -{{#if ADAPTIVE_GUIDANCE}} - learn from previous validation feedback and avoid repeated mistakes -{{/if}} {{ "action": "method.action_name", "actionObjective": "...", @@ -112,10 +107,8 @@ RULES: - Copy references EXACTLY as shown in AVAILABLE_DOCUMENTS_INDEX 6. For requiredConnection, use ONLY an exact label from AVAILABLE_CONNECTIONS_INDEX 7. Plan incrementally: if the overall intent needs multiple output formats (e.g., CSV and HTML), choose one format in this step and leave the other(s) for subsequent steps -{{#if ADAPTIVE_GUIDANCE}} 8. CRITICAL: Learn from previous validation feedback - avoid repeating the same mistakes 9. If previous attempts failed, consider alternative approaches or more specific parameters -{{/if}} """ return PromptBundle(prompt=template, placeholders=placeholders) @@ -197,6 +190,7 @@ Excludes documents/connections/history entirely. 
placeholders: List[PromptPlaceholder] = [ PromptPlaceholder(label="ACTION_OBJECTIVE", content=actionObjective, summaryAllowed=False), PromptPlaceholder(label="SELECTED_ACTION", content=compoundActionName, summaryAllowed=False), + PromptPlaceholder(label="USER_LANGUAGE", content=extractUserLanguage(services), summaryAllowed=False), PromptPlaceholder(label="PARAMETERS_CONTEXT", content=(parametersContext or ""), summaryAllowed=True), PromptPlaceholder(label="ACTION_PARAMETERS", content=actionParametersText, summaryAllowed=False), PromptPlaceholder(label="LEARNINGS", content=learningsText, summaryAllowed=True), @@ -225,19 +219,13 @@ CONTEXT AND OBJECTIVE: SELECTED_ACTION: {{KEY:SELECTED_ACTION}} -{{#if PARAMETER_GUIDANCE}} LEARNING-BASED PARAMETER GUIDANCE: {{KEY:PARAMETER_GUIDANCE}} -{{#if ATTEMPT_NUMBER}} ATTEMPT NUMBER: {{KEY:ATTEMPT_NUMBER}} -{{/if}} -{{#if FAILURE_ANALYSIS}} PREVIOUS FAILURE ANALYSIS: {{KEY:FAILURE_ANALYSIS}} -{{/if}} -{{/if}} REPLY (ONLY JSON): {{ @@ -264,19 +252,15 @@ INSTRUCTIONS: - Fill in appropriate values based on the context and objective - Do NOT invent new parameters - Do NOT include: documentList, connectionReference, history, documents, connections -{{#if PARAMETER_GUIDANCE}} - CRITICAL: Follow the learning-based parameter guidance above - Learn from previous validation failures and adjust parameters accordingly -{{/if}} RULES: - Return ONLY JSON (no markdown, no prose) - Use ONLY the exact parameter names listed in REQUIRED PARAMETERS FOR THIS ACTION - Do NOT add any parameters not listed above - Do NOT add nested objects or custom fields -{{#if PARAMETER_GUIDANCE}} - Apply learning insights to avoid repeated parameter mistakes -{{/if}} """ return PromptBundle(prompt=template, placeholders=placeholders) @@ -285,6 +269,7 @@ def generateReactRefinementPrompt(services, context: Any, reviewContent: str) -> """Define placeholders first, then the template; return PromptBundle.""" placeholders: List[PromptPlaceholder] = [ PromptPlaceholder(label="USER_PROMPT", content=extractUserPrompt(context), summaryAllowed=False), + PromptPlaceholder(label="USER_LANGUAGE", content=extractUserLanguage(services), summaryAllowed=False), PromptPlaceholder(label="REVIEW_CONTENT", content=reviewContent, summaryAllowed=True), ] diff --git a/modules/workflows/processing/shared/promptGenerationTaskplan.py b/modules/workflows/processing/shared/promptGenerationTaskplan.py index eb34df66..e8d1ca77 100644 --- a/modules/workflows/processing/shared/promptGenerationTaskplan.py +++ b/modules/workflows/processing/shared/promptGenerationTaskplan.py @@ -75,7 +75,7 @@ Break down user requests into logical, executable task steps. ```json { "overview": "Brief description of the overall plan", - "userMessage": "User-friendly message explaining the task plan (use {{KEY:USER_LANGUAGE}} language)", + "userMessage": "User-friendly message explaining the task plan in language '{{KEY:USER_LANGUAGE}}'", "tasks": [ { "id": "task_1", @@ -83,7 +83,7 @@ Break down user requests into logical, executable task steps. 
"dependencies": ["task_0"], "success_criteria": ["measurable criteria 1", "measurable criteria 2"], "estimated_complexity": "low|medium|high", - "userMessage": "What this task will accomplish" + "userMessage": "What this task will accomplish in language '{{KEY:USER_LANGUAGE}}'" } ] } diff --git a/modules/workflows/processing/shared/securityUtils.py b/modules/workflows/processing/shared/securityUtils.py deleted file mode 100644 index 8e632709..00000000 --- a/modules/workflows/processing/shared/securityUtils.py +++ /dev/null @@ -1,216 +0,0 @@ -""" -Security utilities for AI prompt construction. -Provides secure content escaping to prevent prompt injection attacks. -""" - -import re -import json -import logging -from typing import Any, Union, List, Dict - -logger = logging.getLogger(__name__) - -def _escapeForAiPrompt(content: str) -> str: - """ - Securely escape content for AI prompts to prevent injection attacks. - - This function: - 1. Escapes all special characters that could break prompt structure - 2. Wraps content in secure delimiters - 3. Handles multi-line content safely - 4. Prevents quote injection and context breaking - - Args: - content: The content to escape - - Returns: - Safely escaped content wrapped in secure delimiters - """ - if not content: - return "" - - # Convert to string if not already - content_str = str(content) - - # Remove or escape dangerous characters that could break prompt structure - # This includes quotes, backslashes, and other special characters - escaped = content_str - - # Escape backslashes first (order matters) - escaped = escaped.replace('\\', '\\\\') - - # Escape quotes and other special characters - escaped = escaped.replace('"', '\\"') - escaped = escaped.replace("'", "\\'") - escaped = escaped.replace('\n', '\\n') - escaped = escaped.replace('\r', '\\r') - escaped = escaped.replace('\t', '\\t') - - # Remove or escape other potentially dangerous characters - # Remove control characters except newlines (already handled above) - escaped = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', escaped) - - # Wrap in secure delimiters with clear boundaries - # Using a unique delimiter pattern that's unlikely to appear in user content - secure_delimiter_start = "===USER_CONTENT_START===" - secure_delimiter_end = "===USER_CONTENT_END===" - - return f"{secure_delimiter_start}\n{escaped}\n{secure_delimiter_end}" - -def _escapeForJsonPrompt(content: Any) -> str: - """ - Securely escape content for JSON-based AI prompts. - - Args: - content: The content to escape (can be any type) - - Returns: - Safely escaped JSON string - """ - try: - # Convert to JSON string with proper escaping - json_str = json.dumps(content, ensure_ascii=False, separators=(',', ':')) - return json_str - except Exception as e: - logger.warning(f"Failed to escape content as JSON: {str(e)}") - # Fallback to string escaping - return _escapeForAiPrompt(str(content)) - -def _escapeForListPrompt(items: List[Any]) -> str: - """ - Securely escape a list of items for AI prompts. 
- - Args: - items: List of items to escape - - Returns: - Safely escaped list representation - """ - if not items: - return "[]" - - try: - escaped_items = [] - for item in items: - if isinstance(item, (dict, list)): - escaped_items.append(_escapeForJsonPrompt(item)) - else: - escaped_items.append(_escapeForAiPrompt(str(item))) - - return f"[{', '.join(escaped_items)}]" - except Exception as e: - logger.warning(f"Failed to escape list content: {str(e)}") - return "[]" - -def securePromptContent(content: Any, content_type: str = "text") -> str: - """ - Main function to securely escape content for AI prompts. - - Args: - content: The content to escape - content_type: Type of content ("text", "json", "list", "user_prompt", "document_content") - - Returns: - Safely escaped content ready for AI prompt insertion - """ - if content is None: - return "" - - try: - if content_type == "json": - return _escapeForJsonPrompt(content) - elif content_type == "list": - if isinstance(content, list): - return _escapeForListPrompt(content) - else: - return _escapeForAiPrompt(str(content)) - elif content_type in ["user_prompt", "document_content"]: - # Extra security for user-controlled content - escaped = _escapeForAiPrompt(str(content)) - # Add additional warning for AI - return f"āš ļø USER_CONTROLLED_CONTENT: {escaped}" - else: # content_type == "text" or default - return _escapeForAiPrompt(str(content)) - - except Exception as e: - logger.error(f"Error escaping content for AI prompt: {str(e)}") - # Return a safe fallback - return "[ERROR: Content could not be safely escaped]" - -def buildSecurePrompt(template: str, **kwargs) -> str: - """ - Build a secure AI prompt by safely inserting content into a template. - - Args: - template: The prompt template with {key} placeholders - **kwargs: Key-value pairs for template substitution - - Returns: - Securely constructed prompt - """ - try: - # Escape all values before substitution - escaped_kwargs = {} - for key, value in kwargs.items(): - if key.endswith('_json'): - escaped_kwargs[key] = securePromptContent(value, "json") - elif key.endswith('_list'): - escaped_kwargs[key] = securePromptContent(value, "list") - elif key in ['user_prompt', 'context', 'document_content', 'user_input']: - escaped_kwargs[key] = securePromptContent(value, "user_prompt") - else: - escaped_kwargs[key] = securePromptContent(value, "text") - - # Use safe string formatting - return template.format(**escaped_kwargs) - - except Exception as e: - logger.error(f"Error building secure prompt: {str(e)}") - return template # Return original template if escaping fails - -def validatePromptSecurity(prompt: str) -> Dict[str, Any]: - """ - Validate that a prompt is secure and doesn't contain injection patterns. 
-    
-    Args:
-        prompt: The prompt to validate
-        
-    Returns:
-        Dictionary with validation results
-    """
-    issues = []
-    
-    # Check for unescaped quotes that could break JSON
-    if '"' in prompt and '\\"' not in prompt:
-        # Check if quotes are properly escaped
-        unescaped_quotes = re.findall(r'(?<!\\)"', prompt)
-        if unescaped_quotes:
-            issues.append("Unescaped quotes detected in prompt")
-    
-    injection_patterns = [
-        r'<\|.*?\|>',  # Special tokens
-    ]
-    
-    for pattern in injection_patterns:
-        if re.search(pattern, prompt, re.IGNORECASE):
-            issues.append(f"Potential injection pattern detected: {pattern}")
-    
-    # Check for proper content delimiters
-    if "===USER_CONTENT_START===" not in prompt and "===USER_CONTENT_END===" not in prompt:
-        # This might be okay for some prompts, but flag for review
-        if any(keyword in prompt.lower() for keyword in ['context', 'user', 'input', 'prompt']):
-            issues.append("User content may not be properly delimited")
-    
-    return {
-        "is_secure": len(issues) == 0,
-        "issues": issues,
-        "prompt_length": len(prompt),
-        "has_user_content_delimiters": "===USER_CONTENT_START===" in prompt
-    }
diff --git a/modules/workflows/workflowManager.py b/modules/workflows/workflowManager.py
index 6808f0fa..cb3a09b5 100644
--- a/modules/workflows/workflowManager.py
+++ b/modules/workflows/workflowManager.py
@@ -216,7 +216,7 @@ class WorkflowManager:
                 "  }\n"
                 "  ]\n"
                 "}\n\n"
-                f"User message:\n{userInput.prompt}"
+                f"User message:\n{self.services.ai.sanitizePromptContent(userInput.prompt, 'userinput')}"
             )
 
             # Call AI analyzer
@@ -716,6 +716,7 @@ class WorkflowManager:
                 logger.error(f"Error processing file ID {fileId}: {str(e)}")
         return documents
 
+
     def _setUserLanguage(self, language: str) -> None:
         """Set user language for the service center"""
         self.services.user.language = language
diff --git a/test_unified_architecture.py b/test_unified_architecture.py
new file mode 100644
index 00000000..bf0e0750
--- /dev/null
+++ b/test_unified_architecture.py
@@ -0,0 +1,258 @@
+import asyncio
+import sys
+import os
+from unittest.mock import AsyncMock, MagicMock
+
+# Add the project root (this file's own directory) to sys.path
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
+from modules.datamodels.datamodelChat import ChatDocument
+from modules.services.serviceAi.subCoreAi import SubCoreAi
+
+class MockAiObjects:
+    def __init__(self, responses):
+        self.responses = responses
+        self.call_count = 0
+
+    async def call(self, request: AiCallRequest):
+        if self.call_count < len(self.responses):
+            response_content = self.responses[self.call_count]
+            self.call_count += 1
+            mock_response = MagicMock()
+            mock_response.content = response_content
+            mock_response.modelName = "mock-model"
+            mock_response.priceUsd = 0.001
+            mock_response.processingTime = 0.1
+            print(f"  Mock AI Call {self.call_count}: Responding with partial result (length: {len(response_content)})")
+            return mock_response
+        else:
+            print("  Mock AI Call: No more mock responses, returning empty.")
+            mock_response = MagicMock()
+            mock_response.content = ""
+            return mock_response
+
+class MockServices:
+    def __init__(self):
+        self.currentWorkflow = MagicMock()
+        self.currentWorkflow.id = "test_workflow_123"
+        self.workflow = MagicMock()
+        self.workflow.createProgressLogger.return_value = MagicMock()
+        self.workflow.storeWorkflowStat = AsyncMock()
+        self.ai = MagicMock()
+        self.ai.sanitizePromptContent.side_effect = lambda content, contentType: content
+        self.utils = MagicMock()
+        self.utils.debugLogToFile.side_effect = lambda msg, tag: print(f"  DEBUG ({tag}): {msg}")
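+        # Note: sanitizePromptContent is stubbed above as a pass-through so prompts
+        # reach the mock AI unmodified; the real sanitization behaviour lives in
+        # AiService and is not under test here.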
+        self.utils.configGet.return_value = False  # Disable debug files for tests
+
+class MockDocumentProcessor:
+    async def callAiText(self, prompt, documents, options):
+        return "Extracted content from documents: Sample text content"
+
+async def test_unified_architecture():
+    print("\n=== Testing Unified Architecture ===")
+
+    # Mock responses: 1 for generation prompt building + 2 for actual generation
+    mock_responses = [
+        # Response 1: Generation prompt building
+        "Generate JSON content that creates a structured document with prime numbers in a table format. Use the canonical JSON format with sections and elements.",
+
+        # Response 2: First part of generation
+        """{
+  "metadata": {
+    "title": "Prime Numbers List",
+    "splitStrategy": "single_document",
+    "source_documents": [],
+    "extraction_method": "ai_generation"
+  },
+  "documents": [
+    {
+      "id": "doc_primes_1_500",
+      "title": "Prime Numbers 1-500",
+      "filename": "primes_1_500.docx",
+      "sections": [
+        {
+          "id": "section_1",
+          "content_type": "table",
+          "elements": [
+            {
+              "headers": ["Number", "Prime"],
+              "rows": [
+                ["1", "2"], ["2", "3"], ["3", "5"], ["4", "7"], ["5", "11"]
+              ]
+            }
+          ],
+          "order": 1
+        }
+      ]
+    }
+  ]
+} [CONTINUE: Generate remaining prime numbers from 501 to 1000]""",
+
+        # Response 3: Second part of generation (the 500th prime is 3571, so the
+        # 501st-503rd primes are 3581, 3583 and 3593)
+        """{
+  "metadata": {
+    "title": "Prime Numbers List",
+    "splitStrategy": "single_document",
+    "source_documents": [],
+    "extraction_method": "ai_generation"
+  },
+  "documents": [
+    {
+      "id": "doc_primes_501_1000",
+      "title": "Prime Numbers 501-1000",
+      "filename": "primes_501_1000.docx",
+      "sections": [
+        {
+          "id": "section_2",
+          "content_type": "table",
+          "elements": [
+            {
+              "headers": ["Number", "Prime"],
+              "rows": [
+                ["501", "3581"], ["502", "3583"], ["503", "3593"]
+              ]
+            }
+          ],
+          "order": 2
+        }
+      ]
+    }
+  ]
+}"""
+    ]
+
+    mock_ai_objects = MockAiObjects(mock_responses)
+    mock_services = MockServices()
+    mock_document_processor = MockDocumentProcessor()
+
+    core_ai_service = SubCoreAi(mock_services, mock_ai_objects)
+
+    prompt = "Generate the first 1000 prime numbers and arrange them in a structured table format."
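+    # The second mock response above ends with a "[CONTINUE: ...]" marker after the
+    # closing brace. The assumption in this test is that _callAiUnifiedGeneration
+    # detects that marker, strips it, issues a follow-up call, and merges both
+    # partial JSON payloads into one valid JSON string before returning.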
+ options = AiCallOptions(operationType=OperationType.GENERATE_CONTENT) + output_format = "docx" + title = "Prime Numbers List" + + print(f"User Prompt: '{prompt}'") + print("Testing unified architecture with direct generation (no documents)...") + + # Test the unified generation method directly + result = await core_ai_service._callAiUnifiedGeneration(prompt, None, options, output_format, title) + + print("\n--- Generated JSON Result ---") + print(f"Result length: {len(result)} characters") + print(f"Result preview: {result[:300]}...") + + # Verify it's valid JSON + import json + try: + parsed_result = json.loads(result) + print(f"āœ… Valid JSON with {len(parsed_result.get('documents', []))} documents") + + # Verify it's using the multi-document format + if "documents" in parsed_result and "metadata" in parsed_result: + print("āœ… Using unified multi-document format") + print("āœ… Architecture is properly unified!") + return True + else: + print("āŒ Not using multi-document format") + return False + except json.JSONDecodeError as e: + print(f"āŒ Invalid JSON: {str(e)}") + return False + +async def test_with_documents(): + print("\n=== Testing Unified Architecture WITH Documents ===") + + # Mock responses: 1 for generation prompt building + 1 for actual generation + mock_responses = [ + # Response 1: Generation prompt building + "Generate JSON content that creates a comprehensive fruit analysis report based on the extracted content. Use the canonical JSON format with sections and elements.", + + # Response 2: Generation with extracted content + """{ + "metadata": { + "title": "Fruit Analysis Report", + "splitStrategy": "single_document", + "source_documents": ["doc1"], + "extraction_method": "ai_generation" + }, + "documents": [ + { + "id": "doc_fruit_analysis", + "title": "Fruit Analysis Report", + "filename": "fruit_analysis.docx", + "sections": [ + { + "id": "section_1", + "content_type": "paragraph", + "elements": [ + { + "text": "Based on the extracted content, here is a comprehensive fruit analysis..." + } + ], + "order": 1 + } + ] + } + ] +}""" + ] + + mock_ai_objects = MockAiObjects(mock_responses) + mock_services = MockServices() + mock_document_processor = MockDocumentProcessor() + + core_ai_service = SubCoreAi(mock_services, mock_ai_objects) + + prompt = "Extract all fruit information and create a comprehensive analysis report." 
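+    # Unlike the first scenario, extracted document text is supplied explicitly
+    # here; in the production path SubCoreAi obtains it via
+    # documentProcessor.callAiText(...) and forwards it as the second argument,
+    # so this test exercises only the generation half of that flow.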
+ options = AiCallOptions(operationType=OperationType.GENERATE_CONTENT) + output_format = "docx" + title = "Fruit Analysis Report" + + print(f"User Prompt: '{prompt}'") + print("Testing unified architecture with document extraction...") + + # Test the unified generation method with extracted content + result = await core_ai_service._callAiUnifiedGeneration(prompt, "Sample fruit data: apples, oranges, bananas", options, output_format, title) + + print("\n--- Generated JSON Result ---") + print(f"Result length: {len(result)} characters") + print(f"Result preview: {result[:300]}...") + + # Verify it's valid JSON + import json + try: + parsed_result = json.loads(result) + print(f"āœ… Valid JSON with {len(parsed_result.get('documents', []))} documents") + + # Verify it's using the multi-document format + if "documents" in parsed_result and "metadata" in parsed_result: + print("āœ… Using unified multi-document format") + print("āœ… Architecture is properly unified!") + return True + else: + print("āŒ Not using multi-document format") + return False + except json.JSONDecodeError as e: + print(f"āŒ Invalid JSON: {str(e)}") + return False + +async def main(): + print("šŸš€ Testing Unified Architecture Implementation") + print("=" * 60) + + success1 = await test_unified_architecture() + success2 = await test_with_documents() + + if success1 and success2: + print("\nšŸŽ‰ ALL TESTS PASSED! Unified architecture is properly implemented.") + print("āœ… Single document = multi-document with n=1") + print("āœ… Always uses multi-document JSON format") + print("āœ… Continuation logic works for long responses") + print("āœ… Both scenarios (with/without documents) work") + else: + print("\nāŒ Some tests failed. Please check the implementation.") + +if __name__ == "__main__": + asyncio.run(main())
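Since both scenario functions return a boolean pass/fail flag, the script can also be driven by a test runner instead of its own asyncio entry point. A minimal sketch, assuming pytest with the pytest-asyncio plugin is available (neither is part of this patch):

import pytest

from test_unified_architecture import test_unified_architecture, test_with_documents

@pytest.mark.asyncio
async def test_direct_generation():
    # Scenario 1: no documents; continuation across two partial mock responses
    assert await test_unified_architecture()

@pytest.mark.asyncio
async def test_generation_with_extracted_content():
    # Scenario 2: generation seeded with pre-extracted document text
    assert await test_with_documents()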