unified centralized ai generation service implemented, start testing

This commit is contained in:
ValueOn AG 2025-10-19 01:04:41 +02:00
parent 6b7094c84d
commit 11522bd763
19 changed files with 1237 additions and 1372 deletions

View file

@ -1,4 +1,5 @@
import logging
import re
from typing import Dict, Any, List, Optional, Tuple, Union
from modules.datamodels.datamodelChat import PromptPlaceholder
@ -189,3 +190,69 @@ class AiService:
prompt, documents, placeholders, options, outputFormat, title,
documentProcessor, documentGenerator
)
def sanitizePromptContent(self, content: str, contentType: str = "text") -> str:
"""
Centralized prompt content sanitization to prevent injection attacks and ensure safe presentation.
This is the single source of truth for all prompt sanitization across the system.
Replaces all scattered sanitization functions with a unified approach.
Args:
content: The content to sanitize
contentType: Type of content ("text", "userinput", "json", "document")
Returns:
Safely sanitized content ready for AI prompt insertion
"""
if not content:
return ""
try:
# Convert to string if not already
content_str = str(content)
# Remove null bytes and control characters (except newlines and tabs)
sanitized = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', content_str)
# Handle different content types with appropriate sanitization
if contentType == "userinput":
# Extra security for user-controlled content
# Escape curly braces to prevent placeholder injection
sanitized = sanitized.replace('{', '{{').replace('}', '}}')
# Escape quotes and wrap in single quotes
sanitized = sanitized.replace('"', '\\"').replace("'", "\\'")
return f"'{sanitized}'"
elif contentType == "json":
# For JSON content, escape quotes and backslashes
sanitized = sanitized.replace('\\', '\\\\')
sanitized = sanitized.replace('"', '\\"')
sanitized = sanitized.replace('\n', '\\n')
sanitized = sanitized.replace('\r', '\\r')
sanitized = sanitized.replace('\t', '\\t')
elif contentType == "document":
# For document content, escape special characters
sanitized = sanitized.replace('\\', '\\\\')
sanitized = sanitized.replace('"', '\\"')
sanitized = sanitized.replace("'", "\\'")
sanitized = sanitized.replace('\n', '\\n')
sanitized = sanitized.replace('\r', '\\r')
sanitized = sanitized.replace('\t', '\\t')
else: # contentType == "text" or default
# Basic text sanitization
sanitized = sanitized.replace('\\', '\\\\')
sanitized = sanitized.replace('"', '\\"')
sanitized = sanitized.replace("'", "\\'")
sanitized = sanitized.replace('\n', '\\n')
sanitized = sanitized.replace('\r', '\\r')
sanitized = sanitized.replace('\t', '\\t')
return sanitized
except Exception as e:
logger.error(f"Error sanitizing prompt content: {str(e)}")
# Return a safe fallback
return "[ERROR: Content could not be safely sanitized]"

View file

@ -75,38 +75,105 @@ class SubCoreAi:
else:
full_prompt = prompt
# Timestamp-only prompt debug writing removed
# Check for unresolved placeholders and clean them up
try:
import re
# Find only {{KEY:...}} patterns that need to be removed
unresolved_placeholders = re.findall(r'\{\{KEY:[^}]+\}\}', full_prompt)
if unresolved_placeholders:
logger.warning(f"Found unresolved KEY placeholders in prompt: {unresolved_placeholders}")
# Remove only {{KEY:...}} patterns, leave other {{...}} content intact
full_prompt = re.sub(r'\{\{KEY:[^}]+\}\}', '', full_prompt)
# Clean up extra whitespace
full_prompt = re.sub(r'\n\s*\n\s*\n', '\n\n', full_prompt)
full_prompt = full_prompt.strip()
logger.info("Cleaned up unresolved KEY placeholders from prompt")
except Exception as e:
logger.warning(f"Error cleaning up prompt placeholders: {str(e)}")
# Log the final integrated prompt that AI will receive
try:
from modules.shared.debugLogger import writeDebugFile
# Determine the prompt type based on operation type
if options.operationType == OperationType.GENERATE_PLAN:
prompt_type = "taskplanPrompt"
elif options.operationType == OperationType.ANALYSE_CONTENT:
prompt_type = "analysisPrompt"
else:
prompt_type = "aiPrompt"
writeDebugFile(full_prompt, prompt_type, documents)
except Exception:
pass # Don't fail on debug logging
except Exception:
pass
# Handle document generation with specific output format
# Handle document generation with specific output format using unified approach
if outputFormat and documentGenerator:
result = await documentGenerator.callAiWithDocumentGeneration(prompt, documents, options, outputFormat, title)
# Use unified generation method for all document generation
if documents and len(documents) > 0:
# Extract content from documents first
logger.info(f"Extracting content from {len(documents)} documents")
extracted_content = await documentProcessor.callAiText(full_prompt, documents, options)
# Generate with extracted content
generated_json = await self._callAiUnifiedGeneration(full_prompt, extracted_content, options, outputFormat, title)
else:
# Direct generation without documents
logger.info("No documents provided - using direct generation")
generated_json = await self._callAiUnifiedGeneration(full_prompt, None, options, outputFormat, title)
# Parse the generated JSON
try:
import json
generated_data = json.loads(generated_json)
except json.JSONDecodeError as e:
logger.error(f"Failed to parse generated JSON: {str(e)}")
return {"success": False, "error": f"Generated content is not valid JSON: {str(e)}"}
# Render to final format using the existing renderer
try:
from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
generationService = GenerationService(self.services)
rendered_content, mime_type = await generationService.renderReport(
generated_data, outputFormat, title or "Generated Document", full_prompt, self
)
# Build result in the expected format
result = {
"success": True,
"content": generated_data,
"documents": [{
"documentName": f"generated.{outputFormat}",
"documentData": rendered_content,
"mimeType": mime_type,
"title": title or "Generated Document"
}],
"is_multi_file": False,
"format": outputFormat,
"title": title,
"split_strategy": "single",
"total_documents": 1,
"processed_documents": 1
}
# Log AI response for debugging
try:
if isinstance(result, dict) and 'content' in result:
self._writeAiResponseDebug(
label='ai_document_generation',
content=result['content'],
partIndex=1,
modelName=None, # Document generation doesn't return model info
continuation=False
)
from modules.shared.debugLogger import writeDebugFile
writeDebugFile(str(result), "documentGenerationResponse", documents)
except Exception:
pass
return result
except Exception as e:
logger.error(f"Error rendering document: {str(e)}")
return {"success": False, "error": f"Rendering failed: {str(e)}"}
if call_type == "planning":
result = await self._callAiPlanning(prompt, placeholders_dict, placeholders_meta, options)
# Log AI response for debugging
try:
self._writeAiResponseDebug(
label='ai_planning',
content=result or "",
partIndex=1,
modelName=None, # Planning doesn't return model info
continuation=False
)
from modules.shared.debugLogger import writeDebugFile
writeDebugFile(str(result or ""), "taskplanResponse", documents)
except Exception:
pass
return result
@ -125,31 +192,13 @@ class SubCoreAi:
if documentProcessor and documents:
result = await documentProcessor.callAiText(full_prompt, documents, options)
else:
# Fallback to direct AI call if no document processor available
request = AiCallRequest(
prompt=full_prompt,
context="",
options=options
)
response = await self.aiObjects.call(request)
result = response.content
# Emit stats for direct AI call
self.services.workflow.storeWorkflowStat(
self.services.currentWorkflow,
response,
f"ai.call.{options.operationType}"
)
# Enhanced direct AI call with partial results support
result = await self._callAiWithPartialResults(full_prompt, options)
# Log AI response for debugging (additional logging for text calls)
try:
self._writeAiResponseDebug(
label='ai_text_main',
content=result or "",
partIndex=1,
modelName=None, # Text calls already log internally
continuation=False
)
from modules.shared.debugLogger import writeDebugFile
writeDebugFile(str(result or ""), "aiTextResponse", documents)
except Exception:
pass
return result
@ -349,6 +398,253 @@ class SubCoreAi:
pass
return response.content
async def _callAiWithPartialResults(
self,
prompt: str,
options: AiCallOptions
) -> str:
"""
Call AI with partial results continuation logic for direct calls.
Handles cases where AI needs to generate large responses in chunks.
"""
logger.info("Starting direct AI call with partial results support")
# Build enhanced prompt with continuation instructions
enhanced_prompt = self._buildDirectContinuationPrompt(prompt)
# Process with continuation logic
return await self._processDirectWithContinuationLoop(enhanced_prompt, options)
def _buildDirectContinuationPrompt(self, base_prompt: str) -> str:
"""
Build a prompt for direct AI calls that includes partial results instructions.
"""
continuation_instructions = """
IMPORTANT: If your response is too large to generate completely in one response, you can deliver partial results and continue.
CONTINUATION LOGIC:
- If you cannot complete the full response, end your response with:
[CONTINUE: brief description of what still needs to be generated]
- The system will call you again to continue from where you left off
- Continue generating from the exact point where you stopped
- Maintain consistency with your previous partial response
- Only stop when you have generated the complete response
Examples:
Example - Code Generation:
If generating a large code file and you can only generate part of it:
- Generate the first part (imports, classes, functions)
- End with: [CONTINUE: Generate the remaining methods and main execution code]
- In the next call, continue from where you left off
Example - Documentation:
If writing comprehensive documentation and you can only generate sections 1-3:
- Generate sections 1-3 with full content
- End with: [CONTINUE: Generate sections 4-8 covering advanced topics and examples]
- In the next call, continue with sections 4-8
This allows you to handle very large responses that exceed normal limits.
"""
return f"{base_prompt}{continuation_instructions}"
async def _processDirectWithContinuationLoop(
self,
enhanced_prompt: str,
options: AiCallOptions
) -> str:
"""
Process direct AI call with continuation loop until complete.
"""
max_iterations = 10 # Prevent infinite loops
iteration = 0
accumulated_content = []
continuation_hint = None
while iteration < max_iterations:
iteration += 1
logger.info(f"Direct AI continuation iteration {iteration}/{max_iterations}")
# Build prompt for this iteration
if continuation_hint:
iteration_prompt = self._buildDirectContinuationIterationPrompt(
enhanced_prompt, continuation_hint, accumulated_content
)
else:
iteration_prompt = enhanced_prompt
# Make AI call for this iteration
try:
request = AiCallRequest(
prompt=iteration_prompt,
context="",
options=options
)
response = await self.aiObjects.call(request)
result = response.content
# Emit stats for this iteration
self.services.workflow.storeWorkflowStat(
self.services.currentWorkflow,
response,
f"ai.call.{options.operationType}.iteration_{iteration}"
)
if not result or not result.strip():
logger.warning(f"Iteration {iteration}: Empty response, stopping")
break
# Check for continuation marker
if "[CONTINUE:" in result:
# Extract the continuation hint
import re
continue_match = re.search(r'\[CONTINUE:\s*([^\]]+)\]', result)
if continue_match:
continuation_hint = continue_match.group(1).strip()
# Remove the continuation marker from the result
result = re.sub(r'\s*\[CONTINUE:[^\]]+\]', '', result).strip()
else:
continuation_hint = "Continue from where you left off"
# Add this partial result to accumulated content
if result.strip():
accumulated_content.append(result.strip())
logger.info(f"Iteration {iteration}: Partial result added, continue hint: {continuation_hint}")
else:
# No continuation marker - this is the final result
if result.strip():
accumulated_content.append(result.strip())
logger.info(f"Direct AI continuation complete after {iteration} iterations")
break
except Exception as e:
logger.error(f"Direct AI iteration {iteration} failed: {str(e)}")
break
if iteration >= max_iterations:
logger.warning(f"Direct AI continuation stopped after maximum iterations ({max_iterations})")
# For JSON responses, we need to merge them properly instead of concatenating
if accumulated_content:
import json
# Parse each part as JSON and merge them
merged_documents = []
merged_metadata = None
for content in accumulated_content:
parsed = json.loads(content)
if isinstance(parsed, dict):
# Extract metadata from first valid JSON
if merged_metadata is None and "metadata" in parsed:
merged_metadata = parsed["metadata"]
# Extract documents from this part
if "documents" in parsed and isinstance(parsed["documents"], list):
merged_documents.extend(parsed["documents"])
# Create final merged JSON - NO FALLBACK
final_result = json.dumps({
"metadata": merged_metadata or {
"title": "Generated Document",
"splitStrategy": "single_document",
"source_documents": [],
"extraction_method": "ai_generation"
},
"documents": merged_documents
}, indent=2)
else:
# Return empty JSON structure if no content
final_result = json.dumps({
"metadata": {
"title": "Generated Document",
"splitStrategy": "single_document",
"source_documents": [],
"extraction_method": "ai_generation"
},
"documents": []
}, indent=2)
logger.info(f"Final direct AI result: {len(accumulated_content)} parts from {iteration} iterations")
return final_result
def _buildDirectContinuationIterationPrompt(
self,
base_prompt: str,
continuation_hint: str,
accumulated_content: List[str]
) -> str:
"""
Build a prompt for continuation iteration with context.
"""
# Build context of what's already been generated
context_summary = "PREVIOUSLY GENERATED CONTENT:\n"
for i, content in enumerate(accumulated_content[-2:]): # Show last 2 parts for context
preview = content[:200] + "..." if len(content) > 200 else content
context_summary += f"Part {i+1}: {preview}\n"
continuation_prompt = f"""
{base_prompt}
{context_summary}
CONTINUATION INSTRUCTIONS:
- Continue from where you left off
- Continuation hint: {continuation_hint}
- Generate the next part of the content
- Maintain consistency with previously generated content
- End with [CONTINUE: description] if more content is needed
- End without [CONTINUE] if the response is complete
"""
return continuation_prompt
async def _callAiUnifiedGeneration(
self,
prompt: str,
extracted_content: Optional[str] = None,
options: Optional[AiCallOptions] = None,
outputFormat: str = "json",
title: str = "Generated Document"
) -> str:
"""
Unified generation method that handles both scenarios:
- With extracted content (from documents)
- Without extracted content (direct generation)
Always uses continuation logic for long responses.
Always returns standardized JSON format using the multi-document schema.
"""
if options is None:
options = AiCallOptions()
logger.info("Starting unified AI generation with continuation logic")
# Use the existing buildGenerationPrompt to get the proper canonical format instructions
from modules.services.serviceGeneration.subPromptBuilder import buildGenerationPrompt
# Build the generation prompt using the existing system
generation_prompt = await buildGenerationPrompt(
outputFormat=outputFormat,
userPrompt=prompt,
title=title,
aiService=self,
services=self.services
)
# If we have extracted content, prepend it to the prompt
if extracted_content:
generation_prompt = f"""EXTRACTED CONTENT FROM DOCUMENTS:
{extracted_content}
{generation_prompt}"""
# Use continuation logic for long responses
return await self._processDirectWithContinuationLoop(generation_prompt, options)
async def _callAiDirect(
self,
prompt: str,
@ -503,10 +799,6 @@ class SubCoreAi:
return full_prompt
def _writeAiResponseDebug(self, label: str, content: Any, partIndex: int = 1, modelName: str = None, continuation: bool = None) -> None:
"""Disabled verbose debug writing; only minimal files elsewhere."""
return
def _exceedsTokenLimit(self, text: str, model: ModelCapabilities, safety_margin: float) -> bool:
"""
Check if text exceeds model token limit with safety margin.

File diff suppressed because it is too large Load diff

View file

@ -107,7 +107,7 @@ class SubDocumentProcessing:
# Save merged extraction content to debug
try:
from modules.shared.debugLogger import writeDebugFile
writeDebugFile(mergedContent or '', "extraction_merged")
writeDebugFile(mergedContent or '', "extractionMergedText")
except Exception:
pass
@ -202,7 +202,7 @@ class SubDocumentProcessing:
from modules.shared.debugLogger import writeDebugFile
import json as _json
jsonStr = _json.dumps(mergedJsonDocument, ensure_ascii=False, indent=2)
writeDebugFile(jsonStr, "extraction_merged_json", mergedJsonDocument)
writeDebugFile(jsonStr, "extractionMergedJson")
except Exception:
pass
@ -225,6 +225,7 @@ class SubDocumentProcessing:
"""
Process documents with per-chunk AI calls and merge results in JSON mode.
Uses a custom prompt instead of the default extraction prompt.
Enhanced with partial results continuation logic.
"""
if not documents:
return {"metadata": {"title": "Empty Document"}, "sections": []}
@ -305,6 +306,199 @@ class SubDocumentProcessing:
logger.error(f"Error in per-chunk JSON processing: {str(e)}")
return {"metadata": {"title": "Error Document"}, "sections": []}
async def processDocumentsWithContinuation(
self,
documents: List[ChatDocument],
custom_prompt: str,
options: Optional[AiCallOptions] = None
) -> Dict[str, Any]:
"""
Process documents with partial results continuation logic.
Handles AI responses that indicate partial completion and loops until complete.
"""
if not documents:
return {"metadata": {"title": "Empty Document"}, "sections": []}
logger.info("Starting document processing with continuation logic")
# Build enhanced prompt with continuation instructions
enhanced_prompt = self._buildContinuationPrompt(custom_prompt)
# Process with continuation logic
return await self._processWithContinuationLoop(documents, enhanced_prompt, options)
def _buildContinuationPrompt(self, base_prompt: str) -> str:
"""
Build a prompt that includes partial results continuation instructions.
"""
continuation_instructions = """
IMPORTANT CHUNKING LOGIC:
- If the response is too large to generate completely in one response, set "continue": true
- When "continue": true, include a "continuation_context" field with:
- "last_section_id": "id of the last completed section"
- "last_element_index": "index of the last completed element in that section"
- "remaining_requirements": "brief description of what still needs to be generated"
- The AI will be called again with this context to continue generation
- Only set "continue": false when the response is completely generated
OUTPUT FORMAT: Return only valid JSON in this exact structure:
{
"metadata": {
"title": "Document Title"
},
"sections": [
{
"id": "section_1",
"content_type": "paragraph",
"elements": [
{
"text": "This is the actual content that should be generated."
}
],
"order": 1
}
],
"continue": false,
"continuation_context": {
"last_section_id": "section_1",
"last_element_index": 0,
"remaining_requirements": "description of what still needs to be generated"
}
}
The AI should generate content using the canonical format with "sections" and "elements".
"""
return f"{base_prompt}{continuation_instructions}"
async def _processWithContinuationLoop(
self,
documents: List[ChatDocument],
enhanced_prompt: str,
options: Optional[AiCallOptions] = None
) -> Dict[str, Any]:
"""
Process documents with continuation loop until complete.
"""
max_iterations = 10 # Prevent infinite loops
iteration = 0
accumulated_sections = []
continuation_context = None
while iteration < max_iterations:
iteration += 1
logger.info(f"Continuation iteration {iteration}/{max_iterations}")
# Build prompt for this iteration
if continuation_context:
iteration_prompt = self._buildContinuationIterationPrompt(
enhanced_prompt, continuation_context, accumulated_sections
)
else:
iteration_prompt = enhanced_prompt
# Process documents for this iteration
try:
# Use the existing processing method
result = await self.processDocumentsPerChunkJsonWithPrompt(
documents, iteration_prompt, options
)
# Check if this is a valid JSON response
if not isinstance(result, dict):
logger.warning(f"Iteration {iteration}: Invalid result type, stopping")
break
# Extract sections from result
sections = result.get("sections", [])
if not sections:
logger.warning(f"Iteration {iteration}: No sections found, stopping")
break
# Add sections to accumulated results
for section in sections:
# Update section order to maintain sequence
section["order"] = len(accumulated_sections) + 1
accumulated_sections.append(section)
# Check if continuation is needed
continue_flag = result.get("continue", False)
continuation_context = result.get("continuation_context")
logger.info(f"Iteration {iteration}: Added {len(sections)} sections, continue={continue_flag}")
if not continue_flag:
logger.info(f"Continuation complete after {iteration} iterations")
break
if not continuation_context:
logger.warning(f"Iteration {iteration}: continue=true but no continuation_context, stopping")
break
except Exception as e:
logger.error(f"Iteration {iteration} failed: {str(e)}")
break
if iteration >= max_iterations:
logger.warning(f"Continuation stopped after maximum iterations ({max_iterations})")
# Build final result
final_result = {
"metadata": {
"title": "Generated Document",
"total_sections": len(accumulated_sections),
"iterations": iteration,
"continuation_used": iteration > 1
},
"sections": accumulated_sections,
"continue": False
}
logger.info(f"Final result: {len(accumulated_sections)} sections from {iteration} iterations")
return final_result
def _buildContinuationIterationPrompt(
self,
base_prompt: str,
continuation_context: Dict[str, Any],
accumulated_sections: List[Dict[str, Any]]
) -> str:
"""
Build a prompt for continuation iteration with context.
"""
last_section_id = continuation_context.get("last_section_id", "")
last_element_index = continuation_context.get("last_element_index", 0)
remaining_requirements = continuation_context.get("remaining_requirements", "")
# Build context of what's already been generated
context_summary = "PREVIOUSLY GENERATED CONTENT:\n"
for i, section in enumerate(accumulated_sections[-3:]): # Show last 3 sections for context
context_summary += f"Section {i+1}: {section.get('id', 'unknown')}\n"
if 'elements' in section and section['elements']:
first_element = section['elements'][0]
if 'text' in first_element:
preview = first_element['text'][:100] + "..." if len(first_element['text']) > 100 else first_element['text']
context_summary += f" Preview: {preview}\n"
continuation_prompt = f"""
{base_prompt}
{context_summary}
CONTINUATION INSTRUCTIONS:
- Continue from where you left off
- Last completed section: {last_section_id}
- Last completed element index: {last_element_index}
- Remaining requirements: {remaining_requirements}
- Generate the next part of the content
- Maintain consistency with previously generated content
- Use the same JSON format as before
- Set "continue": true if more content is needed, false if complete
"""
return continuation_prompt
async def callAiText(
self,
prompt: str,
@ -522,14 +716,8 @@ class SubDocumentProcessing:
# Save extraction prompt and response to debug
try:
from modules.shared.debugLogger import writeDebugFile
debugData = {
"chunk_index": chunk_index,
"mime_type": part.mimeType,
"type_group": part.typeGroup,
"context_length": len(part.data) if part.data else 0
}
writeDebugFile(augmented_prompt, f"extraction_chunk_{chunk_index}", debugData)
writeDebugFile(ai_result or '', f"extraction_chunk_{chunk_index}_response")
writeDebugFile(augmented_prompt, f"extraction-Chunk{chunk_index}-Prompt")
writeDebugFile(ai_result or '', f"extraction-Chunk{chunk_index}-Response")
except Exception:
pass
@ -629,14 +817,8 @@ class SubDocumentProcessing:
# Save extraction prompt and response to debug
try:
from modules.shared.debugLogger import writeDebugFile
debugData = {
"chunk_index": chunk_index,
"mime_type": part.mimeType,
"type_group": part.typeGroup,
"context_length": len(part.data) if part.data else 0
}
writeDebugFile(augmented_prompt_text, f"extraction_chunk_{chunk_index}", debugData)
writeDebugFile(ai_result or '', f"extraction_chunk_{chunk_index}_response")
writeDebugFile(augmented_prompt_text, f"extractionChunk{chunk_index}-Prompt")
writeDebugFile(ai_result or '', f"extractionChunk{chunk_index}-Response")
except Exception:
pass

View file

@ -372,59 +372,6 @@ class GenerationService:
services=self.services
)
async def getGenericExtractionPrompt(
self,
outputFormat: str,
userPrompt: str,
title: str,
aiService=None
) -> str:
"""Get generic extraction prompt that works for both single and multi-file."""
from .subPromptBuilder import buildGenericExtractionPrompt
return await buildGenericExtractionPrompt(
outputFormat=outputFormat,
userPrompt=userPrompt,
title=title,
aiService=aiService,
services=self.services
)
async def getExtractionPrompt(self, outputFormat: str, userPrompt: str, title: str, aiService=None) -> str:
"""
Get the format-specific extraction prompt for AI content extraction.
Args:
outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
userPrompt: User's original prompt for report generation
title: Report title
aiService: AI service instance for intent extraction
Returns:
str: Format-specific prompt for AI extraction
"""
try:
# Get the appropriate renderer for the format
renderer = self._getFormatRenderer(outputFormat)
if not renderer:
raise ValueError(f"Unsupported output format: {outputFormat}")
# Build centralized prompt with generic rules + format-specific guidelines
from .subPromptBuilder import buildExtractionPrompt
extractionPrompt = await buildExtractionPrompt(
outputFormat=outputFormat,
renderer=renderer,
userPrompt=userPrompt,
title=title,
aiService=aiService,
services=self.services
)
logger.info(f"Generated {outputFormat}-specific extraction prompt: {len(extractionPrompt)} characters")
return extractionPrompt
except Exception as e:
logger.error(f"Error getting extraction prompt for {outputFormat}: {str(e)}")
raise
async def renderAdaptiveReport(
self,

View file

@ -344,12 +344,8 @@ class BaseRenderer(ABC):
# Save styling prompt and response to debug
try:
from modules.shared.debugLogger import writeDebugFile
debugData = {
"template_length": len(style_template),
"default_styles_keys": list(default_styles.keys()) if isinstance(default_styles, dict) else []
}
writeDebugFile(style_template, "renderer_styling", debugData)
writeDebugFile(response.content or '', "renderer_styling_response")
writeDebugFile(style_template, "rendererStylingPrompt")
writeDebugFile(response.content or '', "rendererStylingResponse")
except Exception:
pass

View file

@ -62,12 +62,7 @@ class RendererImage(BaseRenderer):
# Save image generation prompt to debug
try:
from modules.shared.debugLogger import writeDebugFile
debugData = {
"title": document_title,
"user_prompt_length": len(user_prompt) if user_prompt else 0,
"extracted_content_keys": list(extracted_content.keys()) if isinstance(extracted_content, dict) else []
}
writeDebugFile(image_prompt, "renderer_image_generation", debugData)
writeDebugFile(image_prompt, "rendererImageGenerationPrompt")
except Exception:
pass
@ -82,12 +77,7 @@ class RendererImage(BaseRenderer):
# Save image generation response to debug
try:
from modules.shared.debugLogger import writeDebugFile
responseData = {
"success": image_result.get("success", False) if image_result else False,
"has_image_data": bool(image_result.get("image_data", "")) if image_result else False,
"result_keys": list(image_result.keys()) if isinstance(image_result, dict) else []
}
writeDebugFile(str(image_result), "renderer_image_generation_response", responseData)
writeDebugFile(str(image_result), "rendererImageGenerationResponse")
except Exception:
pass
@ -114,7 +104,7 @@ class RendererImage(BaseRenderer):
# Add user's original intent if available
if user_prompt:
prompt_parts.append(f"User Request: {user_prompt}")
prompt_parts.append(f"User Request: {ai_service.sanitizePromptContent(user_prompt, 'userinput')}")
# Add document title
prompt_parts.append(f"Document Title: {title}")
@ -151,7 +141,7 @@ class RendererImage(BaseRenderer):
# Fallback to minimal prompt if AI compression fails or is still too long
minimal_prompt = f"Create a professional image representing: {title}"
if user_prompt:
minimal_prompt += f" - {user_prompt}"
minimal_prompt += f" - {ai_service.sanitizePromptContent(user_prompt, 'userinput')}"
# If even the minimal prompt is too long, truncate it
if len(minimal_prompt) > 4000:

View file

@ -81,64 +81,20 @@ async def buildAdaptiveExtractionPrompt(
]
}
# Single-file example data instead of schema
single_file_example = {
"metadata": {
"title": "Single Document Example",
"source_documents": ["doc_001"],
"extraction_method": "ai_extraction"
},
"sections": [
{
"id": "section_1",
"content_type": "heading",
"elements": [
{
"level": 1,
"text": "1. SECTION TITLE"
}
],
"order": 1
},
{
"id": "section_2",
"content_type": "paragraph",
"elements": [
{
"text": "This is the actual content that should be extracted from the document."
}
],
"order": 2
},
{
"id": "section_3",
"content_type": "table",
"elements": [
{
"headers": ["Column 1", "Column 2"],
"rows": [["Value 1", "Value 2"]]
}
],
"order": 3
}
]
}
if promptAnalysis.get("is_multi_file", False):
# Multi-file prompt
# UNIFIED APPROACH: Always use multi-document format (single doc = multi with n=1)
adaptive_prompt = f"""
{userPrompt}
{services.ai.sanitizePromptContent(userPrompt, 'userinput')}
You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.
TASK: Extract the actual content from the document and organize it into separate sections, where each section will become a separate file.
TASK: Extract the actual content from the document and organize it into documents. For single documents, create one document entry. For multi-document requests, create multiple document entries.
REQUIREMENTS:
1. Analyze the document content provided in the context below
2. Identify distinct sections in the document (by headings, topics, or logical breaks)
3. Create one JSON document entry for each section found
3. Create one or more JSON document entries based on the content structure
4. Extract the real content from each section (headings, paragraphs, lists, etc.)
5. Generate appropriate filenames for each section
5. Generate appropriate filenames for each document
CRITICAL: You MUST return a JSON structure with a "documents" array, NOT a "sections" array.
@ -147,17 +103,18 @@ OUTPUT FORMAT: Return only valid JSON in this exact structure:
IMPORTANT: The JSON must have a "documents" key containing an array of document objects. Each document object must have:
- "id": unique identifier
- "title": section title from the document
- "filename": appropriate filename for the section
- "title": document title
- "filename": appropriate filename for the document
- "sections": array of content sections
DO NOT return a JSON with "sections" at the root level. Return a JSON with "documents" at the root level.
INSTRUCTIONS:
- Replace "REPLACE_WITH_ACTUAL_*" placeholders with real content from the document
- For single document requests: Create one document with all content in its sections
- For multi-document requests: Create multiple documents, each with relevant sections
- Use actual section titles, headings, and text from the document
- Create meaningful filenames based on section content
- Ensure each section contains the complete content for that part of the document
- Create meaningful filenames based on content
- Ensure each section contains the complete content for that part
- Do not use generic placeholder text like "Section 1", "Section 2"
- Extract real headings, paragraphs, lists, and other content elements
- CRITICAL: Return JSON with "documents" array, not "sections" array
@ -181,58 +138,12 @@ Image Analysis Requirements:
Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
""".strip()
else:
# Single-file prompt - use example data instead of schema
adaptive_prompt = f"""
{userPrompt}
You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.
TASK: Extract the actual content from the document and organize it into structured sections.
REQUIREMENTS:
1. Analyze the document content provided in the context below
2. Extract all content and organize it into logical sections
3. Create structured JSON with sections containing the extracted content
4. Preserve the original structure and data
OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(single_file_example, indent=2)}
INSTRUCTIONS:
- Replace example data with actual content from the document
- Use actual headings, paragraphs, and text from the document
- Ensure all content is properly structured
- Do not use generic placeholder text
- Extract real content from the documents
CONTEXT (Document Content):
Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements
Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses
Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
""".strip()
return adaptive_prompt
async def buildGenericExtractionPrompt(
async def buildGenerationPrompt(
outputFormat: str,
userPrompt: str,
title: str,

View file

@ -3,9 +3,8 @@ Simple debug logger for AI prompts and responses.
Writes files chronologically to gateway/test-chat/ai/ with sequential numbering.
"""
import os
import json
from datetime import datetime, UTC
from typing import Any, Optional
from typing import List, Optional
def _getDebugDir() -> str:
@ -25,64 +24,16 @@ def _getNextSequenceNumber() -> int:
return len(files) + 1
def _formatJsonReadable(data: Any) -> str:
"""
Format JSON data in a readable line-by-line structure.
Handles both structured objects and text representations of dicts/lists.
Args:
data: The data to format
Returns:
Formatted string representation
"""
try:
# First try to parse if it's a string representation
if isinstance(data, str):
try:
# Try to parse as JSON first
parsed = json.loads(data)
data = parsed
except json.JSONDecodeError:
# Try to evaluate as Python literal (for dict/list strings)
try:
import ast
parsed = ast.literal_eval(data)
if isinstance(parsed, (dict, list)):
data = parsed
except (ValueError, SyntaxError):
# If all parsing fails, treat as plain text
pass
# Convert to JSON string with proper indentation
if isinstance(data, (dict, list)):
jsonStr = json.dumps(data, ensure_ascii=False, default=str, indent=2)
else:
jsonStr = str(data)
# Split into lines and add line numbers for better readability
lines = jsonStr.split('\n')
formattedLines = []
for i, line in enumerate(lines, 1):
# Add line number and proper spacing
lineNum = f"{i:3d}: "
formattedLines.append(f"{lineNum}{line}")
return '\n'.join(formattedLines)
except Exception:
# Fallback to string representation if JSON formatting fails
return str(data)
def writeDebugFile(content: str, fileType: str, data: Optional[Any] = None) -> None:
def writeDebugFile(content: str, fileType: str, documents: Optional[List] = None) -> None:
"""
Write debug content to a file with sequential numbering.
Writes the content as-is since it's already the final integrated prompt.
Includes document list labels for tracing enhancement.
Args:
content: The main content to write
fileType: Type of file (e.g., 'prompt', 'response', 'placeholders')
data: Optional additional data to include as JSON
content: The main content to write (already integrated)
fileType: Type of file (e.g., 'prompt_final', 'response')
documents: Optional list of documents for tracing
"""
try:
debugDir = _getDebugDir()
@ -96,27 +47,23 @@ def writeDebugFile(content: str, fileType: str, data: Optional[Any] = None) -> N
filename = f"{tsWithSeq}-{fileType}.txt"
filepath = os.path.join(debugDir, filename)
# Build content with document tracing
debug_content = content
# Add document list labels for tracing enhancement
if documents:
debug_content += "\n\n=== DOCUMENT LIST FOR TRACING ===\n"
for i, doc in enumerate(documents):
if hasattr(doc, 'fileName'):
debug_content += f"Document {i+1}: {doc.fileName} ({doc.mimeType})\n"
elif hasattr(doc, 'fileId'):
debug_content += f"Document {i+1}: {doc.fileId} ({getattr(doc, 'mimeType', 'unknown')})\n"
else:
debug_content += f"Document {i+1}: {str(doc)[:100]}...\n"
# Write the content with document tracing
with open(filepath, 'w', encoding='utf-8') as f:
f.write(content)
# If structured data provided, also append a human-readable section to the main .txt
try:
if data is not None:
formatted = _formatJsonReadable(data)
with open(filepath, 'a', encoding='utf-8') as f:
f.write("\n\n=== FORMATTED DATA (human-readable) ===\n")
f.write(formatted)
f.write("\n")
except Exception:
pass
# If additional data provided, write it as a separate JSON file with readable formatting
if data is not None:
jsonFilename = f"{tsWithSeq}-{fileType}_data.json"
jsonFilepath = os.path.join(debugDir, jsonFilename)
with open(jsonFilepath, 'w', encoding='utf-8') as f:
formattedData = _formatJsonReadable(data)
f.write(formattedData)
f.write(debug_content)
except Exception as e:
# Silent fail - don't break the main flow

View file

@ -125,10 +125,6 @@ DELIVERED CONTENT TO CHECK:
documents=None,
options=request_options
)
# Write validation prompt/response to debug
from modules.shared.debugLogger import writeDebugFile
writeDebugFile(validationPrompt, "validation_content_prompt")
writeDebugFile(response or '', "validation_content_response")
# No retries or correction prompts here; parse-or-fail below

View file

@ -30,7 +30,7 @@ class IntentAnalyzer:
analysisPrompt = f"""
You are an intent analyzer. Analyze the user's request to understand what they want delivered.
USER REQUEST: {userPrompt}
USER REQUEST: {self.services.ai.sanitizePromptContent(userPrompt, 'userinput')}
CONTEXT: {getattr(context.task_step, 'objective', '') if hasattr(context, 'task_step') and context.task_step else ''}
@ -62,17 +62,12 @@ CRITICAL: Respond with ONLY the JSON object below. Do not include any explanator
from modules.datamodels.datamodelAi import AiCallOptions, OperationType
request_options = AiCallOptions()
request_options.operationType = OperationType.GENERAL
# Write prompt to debug
from modules.shared.debugLogger import writeDebugFile
writeDebugFile(analysisPrompt, "intent_prompt")
response = await self.services.ai.callAi(
prompt=analysisPrompt,
documents=None,
options=request_options
)
# Write response to debug
writeDebugFile(response or '', "intent_response")
# No retries or correction prompts here; parse-or-fail below

View file

@ -94,10 +94,6 @@ class TaskPlanner:
taskPlanningPromptTemplate = bundle.prompt
placeholders = bundle.placeholders
# Write task planning prompt to debug
from modules.shared.debugLogger import writeDebugFile
writeDebugFile(taskPlanningPromptTemplate, "taskplan_prompt", placeholders)
# Centralized AI call: Task planning (quality, detailed) with placeholders
options = AiCallOptions(
operationType=OperationType.GENERATE_PLAN,
@ -119,9 +115,6 @@ class TaskPlanner:
if not prompt:
raise ValueError("AI service returned no response for task planning")
# Write task planning response to debug
writeDebugFile(prompt or '', "taskplan_response")
# Parse task plan response
try:
jsonStart = prompt.find('{')

View file

@ -20,7 +20,6 @@ from modules.workflows.processing.shared.promptGenerationActionsReact import (
generateReactParametersPrompt,
generateReactRefinementPrompt
)
from modules.shared.debugLogger import writeDebugFile
from modules.workflows.processing.shared.placeholderFactory import extractReviewContent
from modules.workflows.processing.adaptive import IntentAnalyzer, ContentValidator, LearningEngine, ProgressTracker
from modules.workflows.processing.adaptive.adaptiveLearningEngine import AdaptiveLearningEngine
@ -191,10 +190,6 @@ class ReactMode(BaseMode):
promptTemplate = bundle.prompt
placeholders = bundle.placeholders
# Write action selection prompt to debug
from modules.shared.debugLogger import writeDebugFile
writeDebugFile(promptTemplate, "action_selection_prompt", placeholders)
# Centralized AI call for plan selection (use plan generation quality)
options = AiCallOptions(
operationType=OperationType.GENERATE_PLAN,
@ -211,8 +206,6 @@ class ReactMode(BaseMode):
placeholders=placeholders,
options=options
)
# Write action selection response to debug
writeDebugFile(response or '', "action_selection_response")
jsonStart = response.find('{') if response else -1
jsonEnd = response.rfind('}') + 1 if response else 0
if jsonStart == -1 or jsonEnd == 0:
@ -306,9 +299,6 @@ class ReactMode(BaseMode):
promptTemplate = bundle.prompt
placeholders = bundle.placeholders
# Write parameters prompt to debug
writeDebugFile(promptTemplate, "parameters_prompt", placeholders)
# Centralized AI call for parameter suggestion (balanced analysis)
options = AiCallOptions(
operationType=OperationType.ANALYSE_CONTENT,
@ -367,12 +357,11 @@ class ReactMode(BaseMode):
if 'language' not in parameters and hasattr(self.services, 'user') and getattr(self.services.user, 'language', None):
parameters['language'] = self.services.user.language
# Write parameters response to debug
# Build merged parameters object
mergedParamObj = {
"schema": (paramObj.get('schema') if isinstance(paramObj, dict) else 'parameters_v1'),
"parameters": parameters
}
writeDebugFile(str(mergedParamObj), "parameters_response", mergedParamObj)
# Build a synthetic ActionItem for execution routing and labels
currentRound = getattr(self.workflow, 'currentRound', 0)
@ -625,9 +614,6 @@ class ReactMode(BaseMode):
promptTemplate = bundle.prompt
placeholders = bundle.placeholders
# Write refinement/validation prompt to debug
writeDebugFile(promptTemplate, "validation_refinement_prompt", placeholders)
# Centralized AI call for refinement decision (balanced analysis)
options = AiCallOptions(
operationType=OperationType.ANALYSE_CONTENT,
@ -644,8 +630,6 @@ class ReactMode(BaseMode):
placeholders=placeholders,
options=options
)
# Write refinement/validation response to debug
writeDebugFile(resp or '', "validation_refinement_response")
# More robust JSON extraction
if not resp:

View file

@ -36,6 +36,9 @@ def generateActionDefinitionPrompt(services, context: Any) -> PromptBundle:
## 📋 Context
### User Language
{{KEY:USER_LANGUAGE}}
### Task Objective
{{KEY:USER_PROMPT}}
@ -45,9 +48,6 @@ def generateActionDefinitionPrompt(services, context: Any) -> PromptBundle:
### Available Connections
{{KEY:AVAILABLE_CONNECTIONS_INDEX}}
### User Language
{{KEY:USER_LANGUAGE}}
### Workflow History
{{KEY:WORKFLOW_HISTORY}}
@ -77,7 +77,7 @@ def generateActionDefinitionPrompt(services, context: Any) -> PromptBundle:
"parameters": {},
"resultLabel": "round{current_round}_task{current_task}_action{action_number}_{descriptive_label}",
"description": "What this action accomplishes",
"userMessage": "User-friendly message in {{KEY:USER_LANGUAGE}}"
"userMessage": "User-friendly message in language '{{KEY:USER_LANGUAGE}}'"
}
]
}
@ -118,7 +118,7 @@ def generateActionDefinitionPrompt(services, context: Any) -> PromptBundle:
- **Make labels meaningful** for future reference
### User Messages
- **Write in user language** ({{KEY:USER_LANGUAGE}})
- **Write in user language:** '{{KEY:USER_LANGUAGE}}'
- **Explain what's happening** in user-friendly terms
- **Keep messages concise** but informative
@ -171,7 +171,7 @@ def generateResultReviewPrompt(context: Any) -> PromptBundle:
"met_criteria": ["criteria1", "criteria2"],
"unmet_criteria": ["criteria3", "criteria4"],
"confidence": 0.85,
"userMessage": "User-friendly message explaining the validation result"
"userMessage": "User-friendly message explaining the validation result in language '{{KEY:USER_LANGUAGE}}'"
}
```

View file

@ -24,6 +24,7 @@ def generateReactPlanSelectionPrompt(services, context: Any, learningEngine=None
"""Define placeholders first, then the template; return PromptBundle."""
placeholders: List[PromptPlaceholder] = [
PromptPlaceholder(label="USER_PROMPT", content=extractUserPrompt(context), summaryAllowed=False),
PromptPlaceholder(label="USER_LANGUAGE", content=extractUserLanguage(services), summaryAllowed=False),
PromptPlaceholder(label="AVAILABLE_DOCUMENTS_SUMMARY", content=extractAvailableDocumentsSummary(services, context), summaryAllowed=True),
PromptPlaceholder(label="AVAILABLE_METHODS", content=extractAvailableMethods(services), summaryAllowed=False),
# Provide enriched history context for Stage 1 to craft parametersContext
@ -68,26 +69,20 @@ AVAILABLE_DOCUMENTS_INDEX:
AVAILABLE_CONNECTIONS_INDEX:
{{KEY:AVAILABLE_CONNECTIONS_INDEX}}
{{#if ADAPTIVE_GUIDANCE}}
LEARNING-BASED GUIDANCE:
{{KEY:ADAPTIVE_GUIDANCE}}
{{#if FAILURE_ANALYSIS}}
FAILURE ANALYSIS:
{{KEY:FAILURE_ANALYSIS}}
{{/if}}
ESCALATION LEVEL: {{KEY:ESCALATION_LEVEL}}
{{/if}}
REPLY: Return ONLY a JSON object with the following structure (no comments, no extra text). The chosen action MUST:
- be the next logical incremental step toward fulfilling the objective
- not attempt to complete the entire objective in one step
- if producing files, target exactly one output format for this step
- reference ONLY existing document IDs/labels from AVAILABLE_DOCUMENTS_INDEX
{{#if ADAPTIVE_GUIDANCE}}
- learn from previous validation feedback and avoid repeated mistakes
{{/if}}
{{
"action": "method.action_name",
"actionObjective": "...",
@ -112,10 +107,8 @@ RULES:
- Copy references EXACTLY as shown in AVAILABLE_DOCUMENTS_INDEX
6. For requiredConnection, use ONLY an exact label from AVAILABLE_CONNECTIONS_INDEX
7. Plan incrementally: if the overall intent needs multiple output formats (e.g., CSV and HTML), choose one format in this step and leave the other(s) for subsequent steps
{{#if ADAPTIVE_GUIDANCE}}
8. CRITICAL: Learn from previous validation feedback - avoid repeating the same mistakes
9. If previous attempts failed, consider alternative approaches or more specific parameters
{{/if}}
"""
return PromptBundle(prompt=template, placeholders=placeholders)
@ -197,6 +190,7 @@ Excludes documents/connections/history entirely.
placeholders: List[PromptPlaceholder] = [
PromptPlaceholder(label="ACTION_OBJECTIVE", content=actionObjective, summaryAllowed=False),
PromptPlaceholder(label="SELECTED_ACTION", content=compoundActionName, summaryAllowed=False),
PromptPlaceholder(label="USER_LANGUAGE", content=extractUserLanguage(services), summaryAllowed=False),
PromptPlaceholder(label="PARAMETERS_CONTEXT", content=(parametersContext or ""), summaryAllowed=True),
PromptPlaceholder(label="ACTION_PARAMETERS", content=actionParametersText, summaryAllowed=False),
PromptPlaceholder(label="LEARNINGS", content=learningsText, summaryAllowed=True),
@ -225,19 +219,13 @@ CONTEXT AND OBJECTIVE:
SELECTED_ACTION:
{{KEY:SELECTED_ACTION}}
{{#if PARAMETER_GUIDANCE}}
LEARNING-BASED PARAMETER GUIDANCE:
{{KEY:PARAMETER_GUIDANCE}}
{{#if ATTEMPT_NUMBER}}
ATTEMPT NUMBER: {{KEY:ATTEMPT_NUMBER}}
{{/if}}
{{#if FAILURE_ANALYSIS}}
PREVIOUS FAILURE ANALYSIS:
{{KEY:FAILURE_ANALYSIS}}
{{/if}}
{{/if}}
REPLY (ONLY JSON):
{{
@ -264,19 +252,15 @@ INSTRUCTIONS:
- Fill in appropriate values based on the context and objective
- Do NOT invent new parameters
- Do NOT include: documentList, connectionReference, history, documents, connections
{{#if PARAMETER_GUIDANCE}}
- CRITICAL: Follow the learning-based parameter guidance above
- Learn from previous validation failures and adjust parameters accordingly
{{/if}}
RULES:
- Return ONLY JSON (no markdown, no prose)
- Use ONLY the exact parameter names listed in REQUIRED PARAMETERS FOR THIS ACTION
- Do NOT add any parameters not listed above
- Do NOT add nested objects or custom fields
{{#if PARAMETER_GUIDANCE}}
- Apply learning insights to avoid repeated parameter mistakes
{{/if}}
"""
return PromptBundle(prompt=template, placeholders=placeholders)
@ -285,6 +269,7 @@ def generateReactRefinementPrompt(services, context: Any, reviewContent: str) ->
"""Define placeholders first, then the template; return PromptBundle."""
placeholders: List[PromptPlaceholder] = [
PromptPlaceholder(label="USER_PROMPT", content=extractUserPrompt(context), summaryAllowed=False),
PromptPlaceholder(label="USER_LANGUAGE", content=extractUserLanguage(services), summaryAllowed=False),
PromptPlaceholder(label="REVIEW_CONTENT", content=reviewContent, summaryAllowed=True),
]

View file

@ -75,7 +75,7 @@ Break down user requests into logical, executable task steps.
```json
{
"overview": "Brief description of the overall plan",
"userMessage": "User-friendly message explaining the task plan (use {{KEY:USER_LANGUAGE}} language)",
"userMessage": "User-friendly message explaining the task plan in language '{{KEY:USER_LANGUAGE}}'",
"tasks": [
{
"id": "task_1",
@ -83,7 +83,7 @@ Break down user requests into logical, executable task steps.
"dependencies": ["task_0"],
"success_criteria": ["measurable criteria 1", "measurable criteria 2"],
"estimated_complexity": "low|medium|high",
"userMessage": "What this task will accomplish"
"userMessage": "What this task will accomplish in language '{{KEY:USER_LANGUAGE}}'"
}
]
}

View file

@ -1,216 +0,0 @@
"""
Security utilities for AI prompt construction.
Provides secure content escaping to prevent prompt injection attacks.
"""
import re
import json
import logging
from typing import Any, Union, List, Dict
logger = logging.getLogger(__name__)
def _escapeForAiPrompt(content: str) -> str:
"""
Securely escape content for AI prompts to prevent injection attacks.
This function:
1. Escapes all special characters that could break prompt structure
2. Wraps content in secure delimiters
3. Handles multi-line content safely
4. Prevents quote injection and context breaking
Args:
content: The content to escape
Returns:
Safely escaped content wrapped in secure delimiters
"""
if not content:
return ""
# Convert to string if not already
content_str = str(content)
# Remove or escape dangerous characters that could break prompt structure
# This includes quotes, backslashes, and other special characters
escaped = content_str
# Escape backslashes first (order matters)
escaped = escaped.replace('\\', '\\\\')
# Escape quotes and other special characters
escaped = escaped.replace('"', '\\"')
escaped = escaped.replace("'", "\\'")
escaped = escaped.replace('\n', '\\n')
escaped = escaped.replace('\r', '\\r')
escaped = escaped.replace('\t', '\\t')
# Remove or escape other potentially dangerous characters
# Remove control characters except newlines (already handled above)
escaped = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', escaped)
# Wrap in secure delimiters with clear boundaries
# Using a unique delimiter pattern that's unlikely to appear in user content
secure_delimiter_start = "===USER_CONTENT_START==="
secure_delimiter_end = "===USER_CONTENT_END==="
return f"{secure_delimiter_start}\n{escaped}\n{secure_delimiter_end}"
def _escapeForJsonPrompt(content: Any) -> str:
"""
Securely escape content for JSON-based AI prompts.
Args:
content: The content to escape (can be any type)
Returns:
Safely escaped JSON string
"""
try:
# Convert to JSON string with proper escaping
json_str = json.dumps(content, ensure_ascii=False, separators=(',', ':'))
return json_str
except Exception as e:
logger.warning(f"Failed to escape content as JSON: {str(e)}")
# Fallback to string escaping
return _escapeForAiPrompt(str(content))
def _escapeForListPrompt(items: List[Any]) -> str:
"""
Securely escape a list of items for AI prompts.
Args:
items: List of items to escape
Returns:
Safely escaped list representation
"""
if not items:
return "[]"
try:
escaped_items = []
for item in items:
if isinstance(item, (dict, list)):
escaped_items.append(_escapeForJsonPrompt(item))
else:
escaped_items.append(_escapeForAiPrompt(str(item)))
return f"[{', '.join(escaped_items)}]"
except Exception as e:
logger.warning(f"Failed to escape list content: {str(e)}")
return "[]"
def securePromptContent(content: Any, content_type: str = "text") -> str:
"""
Main function to securely escape content for AI prompts.
Args:
content: The content to escape
content_type: Type of content ("text", "json", "list", "user_prompt", "document_content")
Returns:
Safely escaped content ready for AI prompt insertion
"""
if content is None:
return ""
try:
if content_type == "json":
return _escapeForJsonPrompt(content)
elif content_type == "list":
if isinstance(content, list):
return _escapeForListPrompt(content)
else:
return _escapeForAiPrompt(str(content))
elif content_type in ["user_prompt", "document_content"]:
# Extra security for user-controlled content
escaped = _escapeForAiPrompt(str(content))
# Add additional warning for AI
return f"⚠️ USER_CONTROLLED_CONTENT: {escaped}"
else: # content_type == "text" or default
return _escapeForAiPrompt(str(content))
except Exception as e:
logger.error(f"Error escaping content for AI prompt: {str(e)}")
# Return a safe fallback
return "[ERROR: Content could not be safely escaped]"
def buildSecurePrompt(template: str, **kwargs) -> str:
"""
Build a secure AI prompt by safely inserting content into a template.
Args:
template: The prompt template with {key} placeholders
**kwargs: Key-value pairs for template substitution
Returns:
Securely constructed prompt
"""
try:
# Escape all values before substitution
escaped_kwargs = {}
for key, value in kwargs.items():
if key.endswith('_json'):
escaped_kwargs[key] = securePromptContent(value, "json")
elif key.endswith('_list'):
escaped_kwargs[key] = securePromptContent(value, "list")
elif key in ['user_prompt', 'context', 'document_content', 'user_input']:
escaped_kwargs[key] = securePromptContent(value, "user_prompt")
else:
escaped_kwargs[key] = securePromptContent(value, "text")
# Use safe string formatting
return template.format(**escaped_kwargs)
except Exception as e:
logger.error(f"Error building secure prompt: {str(e)}")
return template # Return original template if escaping fails
def validatePromptSecurity(prompt: str) -> Dict[str, Any]:
"""
Validate that a prompt is secure and doesn't contain injection patterns.
Args:
prompt: The prompt to validate
Returns:
Dictionary with validation results
"""
issues = []
# Check for unescaped quotes that could break JSON
if '"' in prompt and '\\"' not in prompt:
# Check if quotes are properly escaped
unescaped_quotes = re.findall(r'(?<!\\)"', prompt)
if unescaped_quotes:
issues.append("Unescaped quotes detected")
# Check for potential injection patterns
injection_patterns = [
r'ignore\s+previous\s+instructions',
r'forget\s+everything',
r'you\s+are\s+now',
r'system\s*:',
r'assistant\s*:',
r'user\s*:',
r'<\|.*\|>', # Special tokens
]
for pattern in injection_patterns:
if re.search(pattern, prompt, re.IGNORECASE):
issues.append(f"Potential injection pattern detected: {pattern}")
# Check for proper content delimiters
if "===USER_CONTENT_START===" not in prompt and "===USER_CONTENT_END===" not in prompt:
# This might be okay for some prompts, but flag for review
if any(keyword in prompt.lower() for keyword in ['context', 'user', 'input', 'prompt']):
issues.append("User content may not be properly delimited")
return {
"is_secure": len(issues) == 0,
"issues": issues,
"prompt_length": len(prompt),
"has_user_content_delimiters": "===USER_CONTENT_START===" in prompt
}

View file

@ -216,7 +216,7 @@ class WorkflowManager:
" }\n"
" ]\n"
"}\n\n"
f"User message:\n{userInput.prompt}"
f"User message:\n{self.services.ai.sanitizePromptContent(userInput.prompt, 'userinput')}"
)
# Call AI analyzer
@ -716,6 +716,7 @@ class WorkflowManager:
logger.error(f"Error processing file ID {fileId}: {str(e)}")
return documents
def _setUserLanguage(self, language: str) -> None:
"""Set user language for the service center"""
self.services.user.language = language

View file

@ -0,0 +1,258 @@
import asyncio
import sys
import os
from unittest.mock import AsyncMock, MagicMock
# Add the project root to the sys.path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
from modules.datamodels.datamodelChat import ChatDocument
from modules.services.serviceAi.subCoreAi import SubCoreAi
class MockAiObjects:
def __init__(self, responses):
self.responses = responses
self.call_count = 0
async def call(self, request: AiCallRequest):
if self.call_count < len(self.responses):
response_content = self.responses[self.call_count]
self.call_count += 1
mock_response = MagicMock()
mock_response.content = response_content
mock_response.modelName = "mock-model"
mock_response.priceUsd = 0.001
mock_response.processingTime = 0.1
print(f" Mock AI Call {self.call_count}: Responding with partial result (length: {len(response_content)})")
return mock_response
else:
print(" Mock AI Call: No more mock responses, returning empty.")
mock_response = MagicMock()
mock_response.content = ""
return mock_response
class MockServices:
def __init__(self):
self.currentWorkflow = MagicMock()
self.currentWorkflow.id = "test_workflow_123"
self.workflow = MagicMock()
self.workflow.createProgressLogger.return_value = MagicMock()
self.workflow.storeWorkflowStat = AsyncMock()
self.ai = MagicMock()
self.ai.sanitizePromptContent.side_effect = lambda content, type: content
self.utils = MagicMock()
self.utils.debugLogToFile.side_effect = lambda msg, tag: print(f" DEBUG ({tag}): {msg}")
self.utils.configGet.return_value = False # Disable debug files for tests
class MockDocumentProcessor:
async def callAiText(self, prompt, documents, options):
return "Extracted content from documents: Sample text content"
async def test_unified_architecture():
print("\n=== Testing Unified Architecture ===")
# Mock responses: 1 for generation prompt building + 2 for actual generation
mock_responses = [
# Response 1: Generation prompt building
"Generate JSON content that creates a structured document with prime numbers in a table format. Use the canonical JSON format with sections and elements.",
# Response 2: First part of generation
"""{
"metadata": {
"title": "Prime Numbers List",
"splitStrategy": "single_document",
"source_documents": [],
"extraction_method": "ai_generation"
},
"documents": [
{
"id": "doc_primes_1_500",
"title": "Prime Numbers 1-500",
"filename": "primes_1_500.docx",
"sections": [
{
"id": "section_1",
"content_type": "table",
"elements": [
{
"headers": ["Number", "Prime"],
"rows": [
["1", "2"], ["2", "3"], ["3", "5"], ["4", "7"], ["5", "11"]
]
}
],
"order": 1
}
]
}
]
} [CONTINUE: Generate remaining prime numbers from 501 to 1000]""",
# Response 3: Second part of generation
"""{
"metadata": {
"title": "Prime Numbers List",
"splitStrategy": "single_document",
"source_documents": [],
"extraction_method": "ai_generation"
},
"documents": [
{
"id": "doc_primes_501_1000",
"title": "Prime Numbers 501-1000",
"filename": "primes_501_1000.docx",
"sections": [
{
"id": "section_2",
"content_type": "table",
"elements": [
{
"headers": ["Number", "Prime"],
"rows": [
["501", "3571"], ["502", "3572"], ["503", "3581"]
]
}
],
"order": 2
}
]
}
]
}"""
]
mock_ai_objects = MockAiObjects(mock_responses)
mock_services = MockServices()
mock_document_processor = MockDocumentProcessor()
core_ai_service = SubCoreAi(mock_services, mock_ai_objects)
prompt = "Generate the first 1000 prime numbers and arrange them in a structured table format."
options = AiCallOptions(operationType=OperationType.GENERATE_CONTENT)
output_format = "docx"
title = "Prime Numbers List"
print(f"User Prompt: '{prompt}'")
print("Testing unified architecture with direct generation (no documents)...")
# Test the unified generation method directly
result = await core_ai_service._callAiUnifiedGeneration(prompt, None, options, output_format, title)
print("\n--- Generated JSON Result ---")
print(f"Result length: {len(result)} characters")
print(f"Result preview: {result[:300]}...")
# Verify it's valid JSON
import json
try:
parsed_result = json.loads(result)
print(f"✅ Valid JSON with {len(parsed_result.get('documents', []))} documents")
# Verify it's using the multi-document format
if "documents" in parsed_result and "metadata" in parsed_result:
print("✅ Using unified multi-document format")
print("✅ Architecture is properly unified!")
return True
else:
print("❌ Not using multi-document format")
return False
except json.JSONDecodeError as e:
print(f"❌ Invalid JSON: {str(e)}")
return False
async def test_with_documents():
print("\n=== Testing Unified Architecture WITH Documents ===")
# Mock responses: 1 for generation prompt building + 1 for actual generation
mock_responses = [
# Response 1: Generation prompt building
"Generate JSON content that creates a comprehensive fruit analysis report based on the extracted content. Use the canonical JSON format with sections and elements.",
# Response 2: Generation with extracted content
"""{
"metadata": {
"title": "Fruit Analysis Report",
"splitStrategy": "single_document",
"source_documents": ["doc1"],
"extraction_method": "ai_generation"
},
"documents": [
{
"id": "doc_fruit_analysis",
"title": "Fruit Analysis Report",
"filename": "fruit_analysis.docx",
"sections": [
{
"id": "section_1",
"content_type": "paragraph",
"elements": [
{
"text": "Based on the extracted content, here is a comprehensive fruit analysis..."
}
],
"order": 1
}
]
}
]
}"""
]
mock_ai_objects = MockAiObjects(mock_responses)
mock_services = MockServices()
mock_document_processor = MockDocumentProcessor()
core_ai_service = SubCoreAi(mock_services, mock_ai_objects)
prompt = "Extract all fruit information and create a comprehensive analysis report."
options = AiCallOptions(operationType=OperationType.GENERATE_CONTENT)
output_format = "docx"
title = "Fruit Analysis Report"
print(f"User Prompt: '{prompt}'")
print("Testing unified architecture with document extraction...")
# Test the unified generation method with extracted content
result = await core_ai_service._callAiUnifiedGeneration(prompt, "Sample fruit data: apples, oranges, bananas", options, output_format, title)
print("\n--- Generated JSON Result ---")
print(f"Result length: {len(result)} characters")
print(f"Result preview: {result[:300]}...")
# Verify it's valid JSON
import json
try:
parsed_result = json.loads(result)
print(f"✅ Valid JSON with {len(parsed_result.get('documents', []))} documents")
# Verify it's using the multi-document format
if "documents" in parsed_result and "metadata" in parsed_result:
print("✅ Using unified multi-document format")
print("✅ Architecture is properly unified!")
return True
else:
print("❌ Not using multi-document format")
return False
except json.JSONDecodeError as e:
print(f"❌ Invalid JSON: {str(e)}")
return False
async def main():
print("🚀 Testing Unified Architecture Implementation")
print("=" * 60)
success1 = await test_unified_architecture()
success2 = await test_with_documents()
if success1 and success2:
print("\n🎉 ALL TESTS PASSED! Unified architecture is properly implemented.")
print("✅ Single document = multi-document with n=1")
print("✅ Always uses multi-document JSON format")
print("✅ Continuation logic works for long responses")
print("✅ Both scenarios (with/without documents) work")
else:
print("\n❌ Some tests failed. Please check the implementation.")
if __name__ == "__main__":
asyncio.run(main())