Prompt tuning for generation and validation step

This commit is contained in:
ValueOn AG 2025-10-31 14:28:14 +01:00
parent a7c9e0221f
commit 259ccabbe3
18 changed files with 821 additions and 1259 deletions

View file

@ -1,242 +0,0 @@
"""
Script to analyze codebase for snake_case naming violations that should be camelStyle.
Excludes routes (decorated endpoint functions) and JSON field names.
"""
import ast
import os
import re
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Tuple
import csv
# Patterns to exclude (external library interfaces, etc.)
# These regexes are matched against the *context* argument of
# shouldExcludeName(). NOTE(review): every visible caller in this file passes
# context="" (the default), so in practice only the dunder check filters
# names — confirm whether callers should supply a context string.
EXCLUDE_PATTERNS = [
    r'@.*\.(get|post|put|delete|patch|options|head)',  # FastAPI route decorators
    r'self\.(db|db_|model|orm)',  # Database ORM attributes
    r'\.(objects|query|filter|get|all)',  # ORM methods
    r'(request|response|response_model|status_code)',  # FastAPI params
    r'(snake_case|kebab-case)',  # String literals
]
# External library attribute patterns (should not be changed)
# NOTE(review): this set is not referenced anywhere in the visible code of
# this script — confirm whether it is still needed or should be wired into
# shouldExcludeName().
EXTERNAL_LIB_ATTRIBUTES = {
    'pydantic', 'fastapi', 'sqlalchemy', 'psycopg', 'requests',
    'aiohttp', 'azure', 'google', 'openai', 'anthropic', 'reportlab',
    'docx', 'pptx', 'openpyxl', 'json', 'logging', 'datetime', 'typing'
}
def isRouteFile(filePath: str) -> bool:
    """Heuristically decide whether a path points at a route module.

    True when the path contains the literal 'routes' (case-sensitive) or the
    lowercased file name contains 'route'.
    """
    baseName = os.path.basename(filePath).lower()
    return 'routes' in filePath or 'route' in baseName
def shouldExcludeName(name: str, context: str = "") -> bool:
    """Decide whether a name is exempt from camelCase analysis.

    Dunder names are always exempt; otherwise the usage context is matched
    (case-insensitively) against the EXCLUDE_PATTERNS regexes.
    """
    # Dunders belong to the Python data model and keep their snake form
    if name.startswith('__') and name.endswith('__'):
        return True
    # Exempt names whose surrounding context looks like an external interface
    return any(
        re.search(pattern, context, re.IGNORECASE)
        for pattern in EXCLUDE_PATTERNS
    )
def isSnakeCase(name: str) -> bool:
    """Return True for snake_case identifiers that violate camelCase style.

    Empty strings and names with a leading underscore (private/dunder) are
    not counted; ALL_CAPS constants are conventional and allowed.
    """
    if not name:
        return False
    if name[0] == '_':
        return False
    hasUnderscore = '_' in name
    return hasUnderscore and not name.isupper()
def analyzeFile(filePath: str) -> Dict[str, List[str]]:
    """Analyze a Python file for snake_case naming violations.

    Returns a dict with 'functions', 'parameters' and 'variables' keys, each
    mapping to a list of human-readable violation descriptions
    ("name (line N)"). Files that cannot be read or parsed yield empty lists.

    Fixes over the previous version:
      * async functions are now analyzed (visit_AsyncFunctionDef), which
        matters for FastAPI codebases;
      * tuple/list unpacking targets (a, b = ...) are now checked;
      * unreadable files (OSError) no longer abort the scan.
    """
    violations: Dict[str, List[str]] = {
        'functions': [],
        'parameters': [],
        'variables': [],
    }
    try:
        with open(filePath, 'r', encoding='utf-8') as f:
            content = f.read()
        tree = ast.parse(content, filename=filePath)
    except (SyntaxError, UnicodeDecodeError, OSError):
        # Unparseable or unreadable files are skipped silently
        return violations

    routeMethods = {'get', 'post', 'put', 'delete', 'patch', 'options', 'head'}

    class NamingAnalyzer(ast.NodeVisitor):
        """AST visitor that records snake_case names that should be camelCase."""

        def __init__(self):
            self.violations = violations
            self.currentClass = None

        def _isRouteEndpoint(self, node) -> bool:
            # FastAPI endpoints are decorated with @router.get(...), @app.post(...), etc.
            for decorator in node.decorator_list:
                if isinstance(decorator, ast.Attribute) and decorator.attr in routeMethods:
                    return True
                if isinstance(decorator, ast.Call):
                    func = decorator.func
                    if isinstance(func, ast.Attribute) and func.attr in routeMethods:
                        return True
            return False

        def _recordTargetNames(self, target, lineno: int):
            # Handle plain names as well as tuple/list unpacking targets
            if isinstance(target, ast.Name):
                varName = target.id
                # Skip constants (ALL_CAPS) and private (_xxx) names
                if varName.isupper() or varName.startswith('_'):
                    return
                if isSnakeCase(varName) and not shouldExcludeName(varName):
                    self.violations['variables'].append(f"{varName} (line {lineno})")
            elif isinstance(target, (ast.Tuple, ast.List)):
                for element in target.elts:
                    self._recordTargetNames(element, lineno)

        def visit_FunctionDef(self, node):
            # Route endpoint *names* are exempt, but their parameters and
            # bodies are still checked.
            funcName = node.name
            if not self._isRouteEndpoint(node) and isSnakeCase(funcName) and not shouldExcludeName(funcName):
                self.violations['functions'].append(f"{funcName} (line {node.lineno})")
            for arg in node.args.args:
                if arg.arg not in ('self', 'cls'):
                    paramName = arg.arg
                    if isSnakeCase(paramName) and not shouldExcludeName(paramName):
                        self.violations['parameters'].append(
                            f"{paramName} in {funcName} (line {node.lineno})"
                        )
            # Analyze the function body for local variables
            for stmt in node.body:
                self.visit(stmt)

        # async endpoints/helpers must be checked identically
        visit_AsyncFunctionDef = visit_FunctionDef

        def visit_ClassDef(self, node):
            oldClass = self.currentClass
            self.currentClass = node.name
            self.generic_visit(node)
            self.currentClass = oldClass

        def visit_Assign(self, node):
            for target in node.targets:
                self._recordTargetNames(target, node.lineno)

        def visit_For(self, node):
            self._recordTargetNames(node.target, node.lineno)
            self.generic_visit(node)

        def visit_With(self, node):
            for item in node.items:
                if item.optional_vars is not None:
                    self._recordTargetNames(item.optional_vars, node.lineno)
            self.generic_visit(node)

    NamingAnalyzer().visit(tree)
    return violations
def analyzeCodebase(rootDir: str = 'gateway') -> Dict[str, Dict[str, int]]:
    """Walk rootDir and collect naming violations for every Python file.

    Returns a mapping of module path (forward slashes) to per-category
    violation counts plus the detailed violation lists under 'details'.
    Only files with at least one violation are included.
    """
    results = defaultdict(lambda: {
        'functions': 0,
        'parameters': 0,
        'variables': 0,
        'details': {
            'functions': [],
            'parameters': [],
            'variables': [],
        },
    })
    # Resolve the root: as given, relative to the CWD, or the CWD itself
    # (covers being launched from inside the gateway directory).
    rootPath = Path(rootDir)
    if not rootPath.exists():
        rootPath = Path('.').resolve() / rootDir
        if not rootPath.exists():
            rootPath = Path('.')
    for pyFile in rootPath.rglob('*.py'):
        relPath = str(pyFile.relative_to(rootPath))
        # Tests, tool scripts and bytecode caches are out of scope
        if 'test' in relPath.lower() or 'tool_' in relPath or '__pycache__' in relPath:
            continue
        fileViolations = analyzeFile(str(pyFile))
        # Only record modules that actually have violations
        if any(len(entries) for entries in fileViolations.values()):
            moduleName = relPath.replace('\\', '/')
            moduleStats = results[moduleName]
            moduleStats['functions'] = len(fileViolations['functions'])
            moduleStats['parameters'] = len(fileViolations['parameters'])
            moduleStats['variables'] = len(fileViolations['variables'])
            moduleStats['details'] = fileViolations
    return results
def generateCSV(results: Dict[str, Dict[str, int]], outputFile: str = 'naming_violations.csv'):
    """Write a CSV report of naming violations and print a console summary.

    Rows are sorted with the worst offenders first; modules with zero total
    violations are omitted from the CSV.
    """
    def violationTotal(stats) -> int:
        return stats['functions'] + stats['parameters'] + stats['variables']

    with open(outputFile, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Module', 'Function Names', 'Parameter Names', 'Variable Names', 'Total'])
        rowsWritten = 0
        # Sort by total violations, descending
        orderedModules = sorted(results.items(), key=lambda item: violationTotal(item[1]), reverse=True)
        for module, stats in orderedModules:
            total = violationTotal(stats)
            if total > 0:
                writer.writerow([
                    module,
                    stats['functions'],
                    stats['parameters'],
                    stats['variables'],
                    total,
                ])
                rowsWritten += 1
        if rowsWritten == 0:
            print("WARNING: No rows written to CSV despite finding violations!")
    print(f"CSV report generated: {outputFile}")
    print(f"Total modules analyzed: {len(results)}")
    # Console summary across all modules
    totalFuncs = sum(r['functions'] for r in results.values())
    totalParams = sum(r['parameters'] for r in results.values())
    totalVars = sum(r['variables'] for r in results.values())
    print("\nSummary:")
    print(f"  Function names: {totalFuncs}")
    print(f"  Parameter names: {totalParams}")
    print(f"  Variable names: {totalVars}")
    print(f"  Total violations: {totalFuncs + totalParams + totalVars}")
if __name__ == '__main__':
    print("Analyzing codebase for naming violations...")
    violationResults = analyzeCodebase('gateway')
    # The report lands next to the analyzed code, inside the gateway directory
    reportPath = Path('gateway') / 'naming_violations_report.csv'
    generateCSV(violationResults, str(reportPath))

View file

@ -1,254 +0,0 @@
# Complete Function Call Diagram
```mermaid
graph TB
subgraph AI_Service["AI Service Modules"]
MA[mainServiceAi<br/>AiService]
SC[subCoreAi<br/>SubCoreAi]
SDG[subDocumentGeneration<br/>SubDocumentGeneration]
SDP[subDocumentProcessing<br/>SubDocumentProcessing]
SU[subSharedAiUtils<br/>Utilities]
end
subgraph EXT_Service["Extraction Service Modules"]
MSE[mainServiceExtraction<br/>ExtractionService]
SPE[subPromptBuilderExtraction<br/>buildExtractionPrompt]
SP[subPipeline<br/>runExtraction]
end
subgraph GEN_Service["Generation Service Modules"]
MSG[mainServiceGeneration<br/>GenerationService]
SPG[subPromptBuilderGeneration<br/>buildGenerationPrompt]
SJ[subJsonSchema<br/>Schemas]
end
%% subCoreAi calls
SC -->|_buildGenerationPrompt| SPG
SC -->|callAiDocuments| SDP
SC -->|sanitizePromptContent| SU
%% subDocumentGeneration calls
SDG -->|processDocumentsWithContinuation| SDP
SDG -->|buildGenerationPrompt| SPG
SDG -->|renderReport| MSG
SDG -->|sanitizePromptContent| SU
%% subDocumentProcessing calls
SDP -->|extractContent 3x| MSE
SDP -->|_applyMerging 3x| SP
SDP -->|readImage| SC
%% mainServiceExtraction calls
MSE -->|runExtraction| SP
%% subPromptBuilderExtraction calls
SPE -->|get_document_subJsonSchema| SJ
SPE -->|sanitizePromptContent| SU
%% mainServiceGeneration calls utilities
MSG -->|utility functions| SU
%% subCoreAi detailed calls
SC -.->|aiObjects.call| AI_Interface["AiObjects Interface"]
SDP -.->|aiObjects.call| AI_Interface
%% Style
classDef aiClass fill:#e1f5ff,stroke:#0066cc,stroke-width:2px
classDef extClass fill:#fff5e1,stroke:#cc6600,stroke-width:2px
classDef genClass fill:#e1ffe1,stroke:#006600,stroke-width:2px
classDef utilClass fill:#f0f0f0,stroke:#666,stroke-width:2px
classDef interfaceClass fill:#ffe1f5,stroke:#cc0066,stroke-width:2px
class MA,SC,SDG,SDP,SU aiClass
class MSE,SPE,SP extClass
class MSG,SPG,SJ genClass
class AI_Interface interfaceClass
```
## Detailed Call Map with Function Names
```mermaid
graph LR
%% Nodes
SC[subCoreAi]
SDG[subDocumentGeneration]
SDP[subDocumentProcessing]
SU[subSharedAiUtils]
SPE[subPromptBuilderExtraction]
SPG[subPromptBuilderGeneration]
MSE[mainServiceExtraction]
MSG[mainServiceGeneration]
SP[subPipeline]
SJ[subJsonSchema]
%% subCoreAi function calls
SC -->|"_buildGenerationPrompt()<br/>calls"| SPG
SC -->|"callAiDocuments()<br/>calls callAiText()"| SDP
SC -->|"sanitizePromptContent()"| SU
%% subDocumentGeneration function calls
SDG -->|"_processDocumentsUnified()<br/>calls"| SDP
SDG -->|"_processDocument()<br/>calls"| SPG
SDG -->|"_processDocument()<br/>calls"| MSG
SDG -->|"sanitizePromptContent()"| SU
%% subDocumentProcessing function calls
SDP -->|"extractContent()"| MSE
SDP -->|"_mergePartResults()<br/>_convertPartResultsToJson()<br/>_mergeChunkResultsJson()<br/>all call"| SP
SDP -->|"_processChunksWithMapping()<br/>calls readImage()"| SC
%% Extraction service calls
MSE -->|"extractContent()<br/>calls"| SP
%% Prompt builder calls
SPE -->|"get_document_subJsonSchema()"| SJ
SPE -->|"sanitizePromptContent()"| SU
%% Generation service calls
MSG -->|"uses utility functions"| SU
classDef aiModule fill:#e1f5ff,stroke:#0066cc
classDef extModule fill:#fff5e1,stroke:#cc6600
classDef genModule fill:#e1ffe1,stroke:#006600
class SC,SDG,SDP,SU aiModule
class MSE,SPE,SP extModule
class MSG,SPG,SJ genModule
```
## Call Flow by Module
### 1. subCoreAi (SubCoreAi Class)
**Calls Out:**
- `buildGenerationPrompt()` → subPromptBuilderGeneration (line 363-366)
- `callAiText()` → subDocumentProcessing (line 453)
- `renderReport()` → mainServiceGeneration (line 478-482)
- `sanitizePromptContent()` → subSharedAiUtils (line 61, via services.ai)
**Called By:**
- mainServiceAi (creates instance)
- subDocumentProcessing._processChunksWithMapping (calls readImage at line 672-675)
---
### 2. subDocumentGeneration (SubDocumentGeneration Class)
**Calls Out:**
- `processDocumentsWithContinuation()` → subDocumentProcessing (line 110)
- `buildGenerationPrompt()` → subPromptBuilderGeneration (line 330)
- `renderReport()` → mainServiceGeneration (line 392)
- `sanitizePromptContent()` → subSharedAiUtils (line 466)
**Called By:**
- mainServiceAi (creates instance)
---
### 3. subDocumentProcessing (SubDocumentProcessing Class)
**Calls Out:**
- `extractContent()` → mainServiceExtraction (lines 78, 131, 220)
- `_applyMerging()` → subPipeline (lines 1044, 1095, 1232, 1293, 1345)
- `readImage()` → subCoreAi (line 672-675)
- `sanitizePromptContent()` → subSharedAiUtils (via self.services.ai)
**Called By:**
- mainServiceAi (creates instance)
- subCoreAi.callAiDocuments (calls callAiText at line 453)
- subDocumentGeneration._processDocumentsUnified (calls processDocumentsWithContinuation)
---
### 4. mainServiceExtraction (ExtractionService Class)
**Calls Out:**
- `runExtraction()` → subPipeline (line 61)
- Uses ExtractorRegistry from subRegistry
**Called By:**
- subDocumentProcessing.extractContent (3 times)
---
### 5. subPromptBuilderExtraction
**Calls Out:**
- `get_document_subJsonSchema()` → subJsonSchema (line 172)
- `sanitizePromptContent()` → subSharedAiUtils (via services.ai)
**Called By:**
- mainServiceGeneration (indirectly via getAdaptiveExtractionPrompt)
---
### 6. mainServiceGeneration (GenerationService Class)
**Calls Out:**
- `get_renderer()` → renderers.registry (line 501)
- Utility functions from subDocumentUtility
- Uses modelRegistry (external)
**Called By:**
- subCoreAi.callAiDocuments (calls renderReport)
- subDocumentGeneration._processDocument (calls renderReport)
---
### 7. subPromptBuilderGeneration
**Calls Out:**
- Returns prompt template string
**Called By:**
- subCoreAi._buildGenerationPrompt (line 363-366)
- subDocumentGeneration._processDocument (line 330)
---
### 8. subPipeline
**Calls Out:**
- Creates IntelligentTokenAwareMerger from subMerger (line 96)
- Uses mergers from merging submodules
**Called By:**
- mainServiceExtraction.extractContent (calls runExtraction)
- subDocumentProcessing (calls _applyMerging 5 times)
---
### 9. subSharedAiUtils
**Functions Provided:**
- `buildPromptWithPlaceholders()`
- `sanitizePromptContent()`
- `extractTextFromContentParts()`
- `reduceText()`
- `determineCallType()`
**Called By:**
- subCoreAi (imports and calls functions)
- subDocumentGeneration (via services.ai.sanitizePromptContent)
- subPromptBuilderExtraction (via services.ai.sanitizePromptContent)
---
### 10. subJsonSchema
**Functions Provided:**
- `get_document_subJsonSchema()`
- `get_multi_document_subJsonSchema()`
**Called By:**
- subPromptBuilderExtraction.buildExtractionPrompt (line 172)
---
## Circular Dependencies
**AI Service Loop:**
1. subDocumentProcessing → subCoreAi.readImage() (for image processing)
2. subDocumentProcessing → mainServiceExtraction (for extraction)
3. mainServiceExtraction → subPipeline (for processing)
4. subPipeline creates IntelligentTokenAwareMerger
**Flow:**
```
subDocumentProcessing.extractContent()
→ mainServiceExtraction.extractContent()
→ subPipeline.runExtraction()
→ returns ContentExtracted
→ processed by subDocumentProcessing
→ calls subPipeline._applyMerging()
```

View file

@ -70,6 +70,12 @@ class ModelSelector:
promptSize = len(prompt.encode("utf-8"))
contextSize = len(context.encode("utf-8"))
totalSize = promptSize + contextSize
# Convert bytes to approximate tokens (1 token ≈ 4 bytes)
promptTokens = promptSize / 4
contextTokens = contextSize / 4
totalTokens = totalSize / 4
logger.debug(f"Request sizes - Prompt: {promptTokens:.0f} tokens ({promptSize} bytes), Context: {contextTokens:.0f} tokens ({contextSize} bytes), Total: {totalTokens:.0f} tokens ({totalSize} bytes)")
# Step 1: Filter by operation type (MUST match) - check if model has this operation type
operationFiltered = []
@ -80,10 +86,32 @@ class ModelSelector:
operationFiltered.append(model)
logger.debug(f"After operation type filtering: {len(operationFiltered)} models")
if operationFiltered:
logger.debug(f"Models with {options.operationType.value}: {[m.name for m in operationFiltered]}")
# Step 2: Filter by prompt size (MUST be <= 80% of context size)
promptFiltered = [m for m in operationFiltered if m.contextLength == 0 or promptSize <= m.contextLength * 0.8]
# Note: contextLength is in tokens, so we need to compare tokens with tokens
promptFiltered = []
for model in operationFiltered:
if model.contextLength == 0:
# No context length limit - always pass
promptFiltered.append(model)
else:
maxAllowedTokens = model.contextLength * 0.8
# Compare prompt tokens (not bytes) with model's token limit
if promptTokens <= maxAllowedTokens:
promptFiltered.append(model)
else:
logger.debug(f"Model {model.name} filtered out: promptSize={promptTokens:.0f} tokens > maxAllowed={maxAllowedTokens:.0f} tokens (80% of {model.contextLength} tokens)")
logger.debug(f"After prompt size filtering: {len(promptFiltered)} models")
if not promptFiltered and operationFiltered:
logger.warning(f"All {len(operationFiltered)} models with {options.operationType.value} were filtered out due to prompt size. Prompt: {promptTokens:.0f} tokens. Available models:")
for model in operationFiltered:
maxAllowed = model.contextLength * 0.8 / 4 if model.contextLength > 0 else "unlimited"
logger.warning(f" - {model.name}: contextLength={model.contextLength} tokens, maxAllowed={maxAllowed} tokens")
# Step 3: Calculate scores for each model
scoredModels = []
for model in promptFiltered:

View file

@ -88,4 +88,4 @@ class ExtractionOptions(BaseModel):
maxConcurrentChunks: int = Field(default=5, ge=1, le=20, description="Maximum number of chunks to process concurrently")
class Config:
arbitraryTypesAllowed = True # Allow OperationTypeEnum import
arbitraryTypesAllowed = True # Allow OperationTypeEnum import

View file

@ -19,7 +19,7 @@ from modules.datamodels.datamodelAi import (
AiModelCall,
AiModelResponse,
)
from modules.datamodels.datamodelExtraction import ContentPart
from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
# Dynamic model registry - models are now loaded from connectors via aicore system
@ -73,7 +73,8 @@ class AiObjects:
logger.info(f"Selected model: {selectedModel.name} ({selectedModel.displayName})")
return selectedModel.name
# AI for Extraction and Text Generation
# AI for Extraction, Processing, Generation
async def call(self, request: AiCallRequest) -> AiCallResponse:
"""Call AI model for text generation with model-aware chunking."""
# Handle content parts (unified path)
@ -366,7 +367,6 @@ class AiObjects:
content_parts.append(content_part)
# Use existing merging system
from modules.datamodels.datamodelExtraction import MergeStrategy
merge_strategy = MergeStrategy(
useIntelligentMerging=True,
groupBy="typeGroup",
@ -374,8 +374,7 @@ class AiObjects:
mergeType="concatenate"
)
from modules.services.serviceExtraction.subPipeline import _applyMerging
merged_parts = _applyMerging(content_parts, merge_strategy)
merged_parts = applyMerging(content_parts, merge_strategy)
# Convert merged parts back to final string
final_content = "\n\n".join([part.data for part in merged_parts])
@ -413,7 +412,6 @@ class AiObjects:
content_parts.append(content_part)
# Use existing merging system
from modules.datamodels.datamodelExtraction import MergeStrategy
merge_strategy = MergeStrategy(
useIntelligentMerging=True,
groupBy="typeGroup",
@ -421,8 +419,7 @@ class AiObjects:
mergeType="concatenate"
)
from modules.services.serviceExtraction.subPipeline import _applyMerging
merged_parts = _applyMerging(content_parts, merge_strategy)
merged_parts = applyMerging(content_parts, merge_strategy)
# Convert merged parts back to final string
final_content = "\n\n".join([part.data for part in merged_parts])
@ -505,82 +502,6 @@ class AiObjects:
)
# AI for Image Generation
async def generateImage(self, prompt: str, size: str = "1024x1024", quality: str = "standard", style: str = "vivid", options: AiCallOptions = None) -> AiCallResponse:
    """Generate an image using AI.

    Args:
        prompt: Text description of the image to generate.
        size, quality, style: Passed to the model call options as-is.
        options: Call options; defaults to an IMAGE_GENERATE operation type.

    Returns:
        AiCallResponse with the generated content on success, or an error
        response (errorCount=1) on failure — this method does not raise.
    """
    if options is None:
        options = AiCallOptions(operationType=OperationTypeEnum.IMAGE_GENERATE)
    # Calculate input bytes (used for price calculation and call stats)
    inputBytes = len(prompt.encode("utf-8"))
    # Bug fix: modelName must be bound before the try block — the except
    # handler logs it and puts it in the error response, so a failure inside
    # _selectModel() previously raised NameError and masked the real error.
    modelName = "unknown"
    try:
        # Select the best model for image generation
        modelName = self._selectModel(prompt, "", options)
        selectedModel = modelRegistry.getModel(modelName)
        if not selectedModel:
            raise ValueError(f"Selected model {modelName} not found in registry")
        # Get the connector for this model
        connector = modelRegistry.getConnectorForModel(modelName)
        if not connector:
            raise ValueError(f"No connector found for model {modelName}")
        # Start timing
        startTime = time.time()
        # Create standardized call object for image generation
        modelCall = AiModelCall(
            messages=[{"role": "user", "content": prompt}],
            model=selectedModel,
            options=AiCallOptions(size=size, quality=quality, style=style)
        )
        # Call the model with standardized interface
        if selectedModel.functionCall:
            modelResponse = await selectedModel.functionCall(modelCall)
            # Extract content from standardized response
            if not modelResponse.success:
                raise ValueError(f"Model call failed: {modelResponse.error}")
            content = modelResponse.content
        else:
            raise ValueError(f"Model {modelName} has no function call defined")
        # Calculate timing and output bytes
        endTime = time.time()
        processingTime = endTime - startTime
        outputBytes = len(content.encode("utf-8"))
        # Calculate price using model's own price calculation method
        priceUsd = selectedModel.calculatePriceUsd(processingTime, inputBytes, outputBytes)
        logger.info(f"✅ Image generation successful with model: {modelName}")
        return AiCallResponse(
            success=True,
            content=content,
            modelName=modelName,
            processingTime=processingTime,
            priceUsd=priceUsd,
            bytesSent=inputBytes,
            bytesReceived=outputBytes,
            errorCount=0
        )
    except Exception as e:
        logger.error(f"❌ Image generation failed with model {modelName}: {str(e)}")
        return AiCallResponse(
            content=f"Image generation failed: {str(e)}",
            modelName=modelName,
            priceUsd=0.0,
            processingTime=0.0,
            bytesSent=inputBytes,
            bytesReceived=0,
            errorCount=1
        )
# Utility methods
async def listAvailableModels(self, connectorType: str = None) -> List[Dict[str, Any]]:
"""List available models, optionally filtered by connector type."""
@ -601,3 +522,65 @@ class AiObjects:
models = modelRegistry.getModelsByTag(tag)
return [model.name for model in models]
def applyMerging(parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
    """Apply merging strategy to parts with intelligent token-aware merging.

    Two paths, selected by ``strategy.useIntelligentMerging``:
      * intelligent: IntelligentTokenAwareMerger merges all parts at once,
        using the model capabilities carried on the strategy;
      * traditional: parts are grouped by ``typeGroup`` ("text", "table",
        "structure", other) and each group is merged by its own merger.

    Args:
        parts: Content parts to merge.
        strategy: Merge strategy; supplies capabilities/prompt for the
            intelligent path and grouping rules for the traditional path.

    Returns:
        The merged list of content parts.
    """
    logger.debug(f"applyMerging called with {len(parts)} parts")
    # Import merging dependencies
    # NOTE(review): imported inside the function — presumably to avoid a
    # circular import with the extraction service modules; confirm before
    # hoisting to module level.
    from modules.services.serviceExtraction.merging.mergerText import TextMerger
    from modules.services.serviceExtraction.merging.mergerTable import TableMerger
    from modules.services.serviceExtraction.merging.mergerDefault import DefaultMerger
    from modules.services.serviceExtraction.subMerger import IntelligentTokenAwareMerger
    # Check if intelligent merging is enabled
    if strategy.useIntelligentMerging:
        modelCapabilities = strategy.capabilities or {}
        subMerger = IntelligentTokenAwareMerger(modelCapabilities)
        # Use intelligent merging for all parts
        merged = subMerger.mergeChunksIntelligently(parts, strategy.prompt or "")
        # Calculate and log optimization stats
        stats = subMerger.calculateOptimizationStats(parts, merged)
        logger.info(f"🧠 Intelligent merging stats: {stats}")
        logger.debug(f"Intelligent merging: {stats['original_ai_calls']}{stats['optimized_ai_calls']} calls ({stats['reduction_percent']}% reduction)")
        return merged
    # Fallback to traditional merging: one merger per typeGroup
    textMerger = TextMerger()
    tableMerger = TableMerger()
    defaultMerger = DefaultMerger()
    # Group by typeGroup
    textParts = [p for p in parts if p.typeGroup == "text"]
    tableParts = [p for p in parts if p.typeGroup == "table"]
    structureParts = [p for p in parts if p.typeGroup == "structure"]
    otherParts = [p for p in parts if p.typeGroup not in ("text", "table", "structure")]
    logger.debug(f"Grouped - text: {len(textParts)}, table: {len(tableParts)}, structure: {len(structureParts)}, other: {len(otherParts)}")
    merged: List[ContentPart] = []
    if textParts:
        textMerged = textMerger.merge(textParts, strategy)
        logger.debug(f"TextMerger merged {len(textParts)} parts into {len(textMerged)} parts")
        merged.extend(textMerged)
    if tableParts:
        tableMerged = tableMerger.merge(tableParts, strategy)
        logger.debug(f"TableMerger merged {len(tableParts)} parts into {len(tableMerged)} parts")
        merged.extend(tableMerged)
    if structureParts:
        # For now, treat structure like text
        structureMerged = textMerger.merge(structureParts, strategy)
        logger.debug(f"StructureMerger merged {len(structureParts)} parts into {len(structureMerged)} parts")
        merged.extend(structureMerged)
    if otherParts:
        otherMerged = defaultMerger.merge(otherParts, strategy)
        logger.debug(f"DefaultMerger merged {len(otherParts)} parts into {len(otherMerged)} parts")
        merged.extend(otherMerged)
    logger.debug(f"applyMerging returning {len(merged)} parts")
    return merged

View file

@ -302,6 +302,7 @@ Respond with ONLY a JSON object in this exact format:
# Done - build final result
if operationId:
self.services.workflow.progressLogUpdate(operationId, 0.95, f"Generation complete ({iteration} iterations, {len(allSections)} sections)")
logger.info(f"Generation complete after {iteration} iterations: {len(allSections)} sections")
break
except Exception as e:
@ -385,7 +386,7 @@ Respond with ONLY a JSON object in this exact format:
rawResponse: str = None
) -> bool:
"""
Determine if generation should continue based on JSON completeness and complete_response flag.
Determine if generation should continue based on JSON completeness, complete_response flag, and task completion.
Returns True if we should continue, False if done.
"""
if len(allSections) == 0:
@ -395,13 +396,22 @@ Respond with ONLY a JSON object in this exact format:
if rawResponse:
import re
if re.search(r'"complete_response"\s*:\s*true', rawResponse, re.IGNORECASE):
logger.info(f"Iteration {iteration}: AI marked response as complete (complete_response flag detected)")
return False
# If JSON was complete (and no complete_response flag), we're done
# If JSON was broken and repaired, continue to get more content
# If JSON was complete, stop (AI should have set complete_response if task is done)
# For continuation iterations (iteration > 1), if JSON is complete but no flag was set,
# stop to prevent infinite loops - AI had a chance to set the flag
if wasJsonComplete:
if iteration > 1:
# Continuation mode: JSON complete without flag means we're likely done
# Stop to prevent infinite loops
logger.info(f"Iteration {iteration}: JSON complete without complete_response flag - stopping")
return False
# First iteration with complete JSON - done
return False
else:
# JSON was incomplete/broken - continue
return True
def _buildFinalResultFromSections(
@ -523,25 +533,61 @@ Respond with ONLY a JSON object in this exact format:
)
try:
# Ensure AI connectors are initialized before delegating to generator
if hasattr(self.services, 'ai') and hasattr(self.services.ai, '_ensureAiObjectsInitialized'):
await self.services.ai._ensureAiObjectsInitialized()
if options is None or (hasattr(options, 'operationType') and options.operationType is None):
# Use AI to determine parameters ONLY when truly needed (options=None OR operationType=None)
self.services.workflow.progressLogUpdate(aiOperationId, 0.1, "Analyzing prompt parameters")
options = await self._analyzePromptAndCreateOptions(prompt)
# Route image-generation requests directly to image pipeline to avoid JSON loop
imgFormats = {"png", "jpg", "jpeg", "webp", "image", "base64"}
# Handle image generation requests directly via generic path
opType = getattr(options, "operationType", None)
fmt = (outputFormat or "").lower() if outputFormat else None
isImageRequest = (opType == OperationTypeEnum.IMAGE_GENERATE) or (fmt in imgFormats)
isImageRequest = (opType == OperationTypeEnum.IMAGE_GENERATE)
if isImageRequest:
# Image generation uses generic call path but bypasses document generation pipeline
self.services.workflow.progressLogUpdate(aiOperationId, 0.4, "Calling AI for image generation")
imageResponse = await self.generateImage(prompt, options=options)
self.services.workflow.progressLogUpdate(aiOperationId, 0.9, "Image generated")
self.services.workflow.progressLogFinish(aiOperationId, True)
return imageResponse
# Call via generic path (no looping for images)
request = AiCallRequest(
prompt=prompt,
context="",
options=options
)
response = await self.aiObjects.call(request)
# Extract image data from response
if response.content:
# For base64 format, return in expected format
if outputFormat == "base64":
result = {
"success": True,
"image_data": response.content,
"documents": [{
"documentName": "generated_image.png",
"documentData": response.content,
"mimeType": "image/png",
"title": title or "Generated Image"
}]
}
else:
# Return raw content for other formats
result = response.content
# Emit stats for image generation
self.services.workflow.storeWorkflowStat(
self.services.currentWorkflow,
response,
f"ai.generate.image"
)
self.services.workflow.progressLogUpdate(aiOperationId, 0.9, "Image generated")
self.services.workflow.progressLogFinish(aiOperationId, True)
return result
else:
errorMsg = f"No image data returned: {response.content}"
logger.error(f"Error in AI image generation: {errorMsg}")
self.services.workflow.progressLogFinish(aiOperationId, False)
return {"success": False, "error": errorMsg}
# CRITICAL: For document generation with JSON templates, NEVER compress the prompt
# Compressing would truncate the template structure and confuse the AI
@ -656,102 +702,6 @@ Respond with ONLY a JSON object in this exact format:
self.services.workflow.progressLogFinish(aiOperationId, False)
raise
# AI Image Analysis
async def readImage(
    self,
    prompt: str,
    imageData: Union[str, bytes],
    mimeType: str = None,
    options: Optional[AiCallOptions] = None,
) -> str:
    """Call AI for image analysis using interface.call() with contentParts.

    Args:
        prompt: Instruction describing what to extract from the image.
        imageData: Raw image bytes, or an already base64-encoded string.
        mimeType: Image MIME type; defaults to "image/jpeg" when omitted.
        options: Call options; operationType is always forced to
            IMAGE_ANALYSE regardless of what the caller set.

    Returns:
        The model's text response, or a string starting with "Error:" on any
        failure — this method never raises.
    """
    await self._ensureAiObjectsInitialized()
    try:
        # Check if imageData is valid
        if not imageData:
            error_msg = "No image data provided"
            logger.error(f"Error in AI image analysis: {error_msg}")
            return f"Error: {error_msg}"
        # Always use IMAGE_ANALYSE operation type for image processing
        if options is None:
            options = AiCallOptions(operationType=OperationTypeEnum.IMAGE_ANALYSE)
        else:
            # Override the operation type to ensure image analysis
            options.operationType = OperationTypeEnum.IMAGE_ANALYSE
        # Create content parts with image data
        from modules.datamodels.datamodelExtraction import ContentPart
        import base64
        # ContentPart.data must be a string - convert bytes to base64 if needed
        if isinstance(imageData, bytes):
            imageDataStr = base64.b64encode(imageData).decode('utf-8')
        else:
            # Already a base64 string
            imageDataStr = imageData
        imagePart = ContentPart(
            id="image_0",
            parentId=None,
            label="Image",
            typeGroup="image",
            mimeType=mimeType or "image/jpeg",
            data=imageDataStr,  # Must be a string (base64 encoded)
            metadata={"imageAnalysis": True}
        )
        # Create request with content parts
        request = AiCallRequest(
            prompt=prompt,
            context="",
            options=options,
            contentParts=[imagePart]
        )
        response = await self.aiObjects.call(request)
        result = response.content
        # Check if result is valid (empty/whitespace responses are failures)
        if not result or (isinstance(result, str) and not result.strip()):
            error_msg = f"No response from AI image analysis (result: {repr(result)})"
            logger.error(f"Error in AI image analysis: {error_msg}")
            return f"Error: {error_msg}"
        return result
    except Exception as e:
        logger.error(f"Error in AI image analysis: {str(e)}")
        return f"Error: {str(e)}"
# AI Image Generation
async def generateImage(
    self,
    prompt: str,
    size: str = "1024x1024",
    quality: str = "standard",
    style: str = "vivid",
    options: Optional[AiCallOptions] = None,
) -> Dict[str, Any]:
    """Generate an image using AI using interface.generateImage().

    Thin service-level wrapper: delegates to aiObjects.generateImage,
    records a workflow stat for the call, and converts any exception into a
    {"success": False, "error": ...} dict instead of raising.
    """
    await self._ensureAiObjectsInitialized()
    try:
        response = await self.aiObjects.generateImage(prompt, size, quality, style, options)
        # Emit stats for image generation
        self.services.workflow.storeWorkflowStat(
            self.services.currentWorkflow,
            response,
            f"ai.generate.image"
        )
        return response
    except Exception as e:
        logger.error(f"Error in AI image generation: {str(e)}")
        return {"success": False, "error": str(e)}
async def callAiText(
self,
prompt: str,

View file

@ -5,7 +5,7 @@ import time
import asyncio
from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .subPipeline import runExtraction, _applyMerging
from .subPipeline import runExtraction
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy, ExtractionOptions, PartResult
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallResponse, AiCallRequest, AiCallOptions, OperationTypeEnum
@ -14,6 +14,9 @@ from modules.aicore.aicoreModelRegistry import modelRegistry
logger = logging.getLogger(__name__)
# Rebuild ExtractionOptions to resolve forward references after all imports are complete
ExtractionOptions.model_rebuild()
class ExtractionService:
def __init__(self, services: Optional[Any] = None):
@ -649,7 +652,8 @@ class ExtractionService:
# Apply existing merging logic using the sophisticated merging system
merged_parts = _applyMerging(content_parts, merge_strategy)
from modules.interfaces.interfaceAiObjects import applyMerging
merged_parts = applyMerging(content_parts, merge_strategy)
# Convert merged parts back to final string
final_content = "\n\n".join([part.data for part in merged_parts])

View file

@ -4,55 +4,11 @@ import logging
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, ExtractionOptions, MergeStrategy
from .subUtils import makeId
from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .merging.mergerText import TextMerger
from .merging.mergerTable import TableMerger
from .merging.mergerDefault import DefaultMerger
from .subMerger import IntelligentTokenAwareMerger
logger = logging.getLogger(__name__)
def _mergeParts(parts: List[ContentPart], mergeStrategy: MergeStrategy) -> List[ContentPart]:
    """Merge parts according to *mergeStrategy*.

    Parts are bucketed by the strategy's ``groupBy`` attribute; buckets with
    more than one part are sorted by ``orderBy`` (when set) and merged with a
    type-specific merger chosen from the first part's ``typeGroup``.
    Single-part buckets pass through untouched.
    """
    if not parts or not mergeStrategy:
        return parts

    groupField = mergeStrategy.groupBy
    sortField = mergeStrategy.orderBy

    # Bucket parts by the grouping field, preserving encounter order.
    buckets = {}
    for part in parts:
        buckets.setdefault(getattr(part, groupField, "unknown"), []).append(part)

    # Dispatch table: typeGroup -> merger class (DefaultMerger otherwise).
    mergerByType = {"text": TextMerger, "table": TableMerger}

    result: List[ContentPart] = []
    for bucket in buckets.values():
        if len(bucket) == 1:
            result.extend(bucket)
            continue
        if sortField:
            bucket.sort(key=lambda p: getattr(p, sortField, ""))
        kind = bucket[0].typeGroup if bucket else "unknown"
        mergerCls = mergerByType.get(kind, DefaultMerger)
        result.extend(mergerCls().merge(bucket, mergeStrategy))
    return result
# REMOVED: _mergeParts function - unused, functionality replaced by applyMerging in interfaceAiObjects.py
def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: ExtractionOptions) -> ContentExtracted:
@ -78,69 +34,12 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
# Apply merging strategy if provided (preserve existing logic)
if options.mergeStrategy:
parts = _applyMerging(parts, options.mergeStrategy)
from modules.interfaces.interfaceAiObjects import applyMerging
parts = applyMerging(parts, options.mergeStrategy)
return ContentExtracted(id=makeId(), parts=parts)
# REMOVED: poolAndLimit function - chunking now handled in AI call phase
def _applyMerging(parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
    """Apply merging strategy to parts with intelligent token-aware merging.

    Two paths:
    - When ``strategy.useIntelligentMerging`` is set, everything is delegated
      to IntelligentTokenAwareMerger, which batches chunks to cut AI calls.
    - Otherwise parts are split by ``typeGroup`` (text/table/structure/other)
      and each group is merged by its matching merger class.
    """
    logger.debug(f"_applyMerging called with {len(parts)} parts")
    # Check if intelligent merging is enabled
    if strategy.useIntelligentMerging:
        model_capabilities = strategy.capabilities or {}
        subMerger = IntelligentTokenAwareMerger(model_capabilities)
        # Use intelligent merging for all parts
        merged = subMerger.mergeChunksIntelligently(parts, strategy.prompt or "")
        # Calculate and log optimization stats
        # NOTE(review): assumes the stats dict exposes 'original_ai_calls',
        # 'optimized_ai_calls' and 'reduction_percent' keys — defined by
        # IntelligentTokenAwareMerger.calculateOptimizationStats; confirm there.
        stats = subMerger.calculateOptimizationStats(parts, merged)
        logger.info(f"🧠 Intelligent merging stats: {stats}")
        logger.debug(f"Intelligent merging: {stats['original_ai_calls']}{stats['optimized_ai_calls']} calls ({stats['reduction_percent']}% reduction)")
        return merged
    # Fallback to traditional merging
    textMerger = TextMerger()
    tableMerger = TableMerger()
    defaultMerger = DefaultMerger()
    # Group by typeGroup
    textParts = [p for p in parts if p.typeGroup == "text"]
    tableParts = [p for p in parts if p.typeGroup == "table"]
    structureParts = [p for p in parts if p.typeGroup == "structure"]
    otherParts = [p for p in parts if p.typeGroup not in ("text", "table", "structure")]
    logger.debug(f"Grouped - text: {len(textParts)}, table: {len(tableParts)}, structure: {len(structureParts)}, other: {len(otherParts)}")
    merged: List[ContentPart] = []
    if textParts:
        textMerged = textMerger.merge(textParts, strategy)
        logger.debug(f"TextMerger merged {len(textParts)} parts into {len(textMerged)} parts")
        merged.extend(textMerged)
    if tableParts:
        tableMerged = tableMerger.merge(tableParts, strategy)
        logger.debug(f"TableMerger merged {len(tableParts)} parts into {len(tableMerged)} parts")
        merged.extend(tableMerged)
    if structureParts:
        # For now, treat structure like text
        structureMerged = textMerger.merge(structureParts, strategy)
        logger.debug(f"StructureMerger merged {len(structureParts)} parts into {len(structureMerged)} parts")
        merged.extend(structureMerged)
    if otherParts:
        otherMerged = defaultMerger.merge(otherParts, strategy)
        logger.debug(f"DefaultMerger merged {len(otherParts)} parts into {len(otherMerged)} parts")
        merged.extend(otherMerged)
    logger.debug(f"_applyMerging returning {len(merged)} parts")
    return merged
# REMOVED: _applySizeLimit function - no longer needed after removing poolAndLimit
# REMOVED: applyMerging function - moved to interfaceAiObjects.py for proper interface-level access

View file

@ -61,27 +61,58 @@ class RendererImage(BaseRenderer):
# Save image generation prompt to debug
aiService.services.utils.writeDebugFile(imagePrompt, "image_generation_prompt")
# Generate image using AI
imageResult = await aiService.aiObjects.generateImage(
# Format prompt as JSON with image generation parameters
from modules.datamodels.datamodelAi import AiCallPromptImage, AiCallOptions, OperationTypeEnum
import json
promptModel = AiCallPromptImage(
prompt=imagePrompt,
size="1024x1024",
quality="standard",
style="vivid"
)
promptJson = promptModel.model_dump_json(exclude_none=True, indent=2)
# Use generic path via callAiDocuments
options = AiCallOptions(
operationType=OperationTypeEnum.IMAGE_GENERATE,
resultFormat="base64"
)
# Call via generic path
imageResult = await aiService.callAiDocuments(
prompt=promptJson,
documents=None,
options=options,
outputFormat="base64"
)
# Save image generation response to debug
aiService.services.utils.writeDebugFile(str(imageResult), "image_generation_response")
# Extract base64 image data from result
if imageResult and imageResult.get("success", False):
imageData = imageResult.get("image_data", "")
if imageData:
return imageData
else:
# The generic path returns a dict with documents array for base64 format
if isinstance(imageResult, dict):
if imageResult.get("success", False):
# Check if it's the new format with documents array
documents = imageResult.get("documents", [])
if documents and len(documents) > 0:
imageData = documents[0].get("documentData", "")
if imageData:
return imageData
# Fallback: check for image_data field
imageData = imageResult.get("image_data", "")
if imageData:
return imageData
raise ValueError("No image data returned from AI")
else:
errorMsg = imageResult.get("error", "Unknown error")
raise ValueError(f"AI image generation failed: {errorMsg}")
elif isinstance(imageResult, str):
# If it's just a string, it might be base64 data directly
return imageResult
else:
errorMsg = imageResult.get("error", "Unknown error") if imageResult else "No result"
raise ValueError(f"AI image generation failed: {errorMsg}")
raise ValueError(f"Unexpected image generation result format: {type(imageResult)}")
except Exception as e:
self.logger.error(f"Error generating AI image: {str(e)}")

View file

@ -93,10 +93,10 @@ Instructions:
- Arrays must contain ONLY JSON values; do not include comments or ellipses.
- Use ONLY the element structures shown in the template.
- Continue from where it stopped add NEW items only; do not repeat existing items.
- Generate all remaining content needed to complete the user request.
- Generate remaining content to complete the user request.
- Fill with actual content (no placeholders or instructional text such as "Add more...").
- When fully complete, add "complete_response": true at root level.
- Output JSON only; no markdown fences or extra text before/after.
- When the request is fully satisfied, add "complete_response": true at root level.
- Output JSON only; no markdown fences or extra text.
Continue generating:
"""
@ -110,14 +110,12 @@ JSON structure template:
{jsonTemplate}
Instructions:
- Start your response with {{"metadata": ...}} return COMPLETE, STRICT JSON.
- Return ONLY valid JSON (strict). No comments of any kind (no //, /* */, or #). No trailing commas. Strings must use double quotes.
- Arrays must contain ONLY JSON values; do not include comments or ellipses.
- Do NOT reuse the example section IDs from the template; create your own.
- Use ONLY the element structures shown in the template.
- Start with {{"metadata": ...}} return COMPLETE, STRICT JSON.
- Return ONLY valid JSON (strict). No comments. No trailing commas. Use double quotes.
- Do NOT reuse example section IDs; create your own.
- Generate complete content based on the user request.
- When fully complete, add "complete_response": true at root level.
- Output JSON only; no markdown fences or any additional text.
- When the request is fully satisfied, add "complete_response": true at root level.
- Output JSON only; no markdown fences or extra text.
Generate your complete response starting from {{"metadata": ...}}:
"""

View file

@ -26,7 +26,7 @@ class WebService:
language: Optional[str],
researchDepth: str = "general",
operationId: str = None
) -> Dict[str, Any]:
) -> Dict[str, Any]:
"""
Perform web research in two steps:
1. Use AI to analyze prompt and extract parameters + URLs
@ -128,7 +128,7 @@ class WebService:
country: Optional[str],
language: Optional[str],
researchDepth: str = "general"
) -> Dict[str, Any]:
) -> Dict[str, Any]:
"""
Use AI to analyze prompt and extract:
- URLs from the prompt text
@ -195,7 +195,7 @@ Return ONLY valid JSON, no additional text:
maxNumberPages: int,
country: Optional[str],
language: Optional[str]
) -> List[str]:
) -> List[str]:
"""Perform web search to find URLs."""
try:
# Build search prompt model
@ -248,7 +248,7 @@ Return ONLY valid JSON, no additional text:
instruction: str,
urls: List[str],
maxDepth: int = 2
) -> List[Dict[str, Any]]:
) -> List[Dict[str, Any]]:
"""Perform web crawl on list of URLs - calls plugin for each URL individually."""
crawlResults = []

View file

@ -23,6 +23,7 @@ class ProgressLogger:
self.workflowService = workflowService
self.workflow = workflow
self.activeOperations = {}
self.finishedOperations = set() # Track finished operations to avoid repeated warnings
def startOperation(self, operationId: str, serviceName: str, actionName: str, context: str = ""):
"""Start a new long-running operation.
@ -33,6 +34,9 @@ class ProgressLogger:
actionName: Name of the action being performed
context: Additional context information
"""
# Remove from finished operations if it was there (for restart scenarios)
self.finishedOperations.discard(operationId)
self.activeOperations[operationId] = {
'service': serviceName,
'action': actionName,
@ -51,7 +55,14 @@ class ProgressLogger:
statusUpdate: Additional status information
"""
if operationId not in self.activeOperations:
logger.warning(f"Operation {operationId} not found for progress update")
# Only warn once per operation ID if it was already finished
if operationId in self.finishedOperations:
# Operation already finished - silently ignore subsequent updates
return
else:
# Operation never started - log warning once and mark as problematic
logger.warning(f"Operation {operationId} not found for progress update (operation never started)")
self.finishedOperations.add(operationId) # Prevent repeated warnings
return
op = self.activeOperations[operationId]
@ -67,7 +78,10 @@ class ProgressLogger:
success: Whether the operation completed successfully
"""
if operationId not in self.activeOperations:
logger.warning(f"Operation {operationId} not found for completion")
# Only warn once if operation was already finished
if operationId not in self.finishedOperations:
logger.warning(f"Operation {operationId} not found for completion (operation never started or already finished)")
self.finishedOperations.add(operationId)
return
op = self.activeOperations[operationId]
@ -83,6 +97,9 @@ class ProgressLogger:
# Remove from active operations AFTER creating the log
del self.activeOperations[operationId]
# Mark as finished to prevent repeated warnings from updateOperation calls
self.finishedOperations.add(operationId)
def _logProgress(self, operationId: str, progress: float, status: str):
"""Create standardized ChatLog entry.
@ -122,4 +139,5 @@ class ProgressLogger:
def clearAllOperations(self):
    """Reset the tracker, dropping every active and every finished operation."""
    self.finishedOperations.clear()
    self.activeOperations.clear()
    logger.debug("Cleared all active operations")

View file

@ -121,6 +121,15 @@ class MethodAi(MethodBase):
mimeType=d.get("mimeType") or output_mime_type
))
# Preserve structured content field for validation (if it exists)
# This allows validator to see the actual structured data, not just rendered output
if "content" in result and result["content"] and isinstance(result["content"], (dict, list)):
action_documents.append(ActionDocument(
documentName="structured_content.json",
documentData=result["content"],
mimeType="application/json"
))
final_documents = action_documents
else:
extension = output_extension.lstrip('.')
@ -222,74 +231,3 @@ class MethodAi(MethodBase):
except:
pass
return ActionResult.isFailure(error=str(e))
@action
async def generateImage(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    GENERAL:
    - Purpose: Generate images using AI based on text prompts.
    - Input requirements: prompt (required); optional size, quality, style.
    - Output format: Base64 encoded image data.
    Parameters:
    - prompt (str, required): Text description of the image to generate.
    - size (str, optional): Image size. Options: 1024x1024, 1792x1024, 1024x1792. Default: 1024x1024.
    - quality (str, optional): Image quality. Options: standard, hd. Default: standard.
    - style (str, optional): Image style. Options: vivid, natural. Default: vivid.
    """
    # NOTE: docstring kept verbatim — its GENERAL/Parameters structure appears
    # to feed action discovery, so it is part of the contract.
    try:
        imagePrompt = parameters.get("prompt")
        if not imagePrompt:
            return ActionResult.isFailure(error="Image prompt is required")

        # Serialize the prompt plus rendering knobs as a structured JSON payload.
        payload = AiCallPromptImage(
            prompt=imagePrompt,
            size=parameters.get("size", "1024x1024"),
            quality=parameters.get("quality", "standard"),
            style=parameters.get("style", "vivid"),
        ).model_dump_json(exclude_none=True, indent=2)

        # Route through the unified document call path, asking for base64 output.
        callOptions = AiCallOptions(
            operationType=OperationTypeEnum.IMAGE_GENERATE,
            resultFormat="base64"
        )
        imageData = await self.services.ai.callAiDocuments(
            prompt=payload,
            documents=None,
            options=callOptions,
            outputFormat="base64"
        )

        # Wrap the result in an ActionDocument with a meaningful file name.
        fileName = self._generateMeaningfulFileName(
            base_name="generated_image",
            extension="png",
            action_name="generate"
        )
        from modules.datamodels.datamodelChat import ActionDocument
        return ActionResult.isSuccess(documents=[ActionDocument(
            documentName=fileName,
            documentData=imageData,
            mimeType="image/png"
        )])
    except Exception as e:
        logger.error(f"Error in image generation: {str(e)}")
        return ActionResult.isFailure(error=str(e))

View file

@ -0,0 +1,266 @@
# Content Validator - Deep Analysis & Target Design
## CURRENT STATE ANALYSIS
### How Validator Currently Works
#### 1. **Document Input Flow**
```
ActionResult.documents (List[ActionDocument])
→ modeReact.py extracts "structured content" with hardcoded checks
→ Creates SimpleNamespace objects with wrapped documentData
→ Passes to ContentValidator.validateContent()
```
#### 2. **Current Problems in modeReact.py (Lines 99-136)**
- ❌ **Hardcoded document name checks**: `docName == "structured_content.json"`
- ❌ **Hardcoded mimeType checks**: `mimeType == "application/json"`
- ❌ **Hardcoded structure checks**: `'content' in docData or 'documents' in docData or 'sections' in docData`
- ❌ **Single document selection**: `break` after first match - ignores other documents
- ❌ **Non-generic logic**: Specific to certain document structures
- ❌ **Workaround approach**: Trying to find structured content in various ways
#### 3. **Current Problems in contentValidator.py**
**`_extractContent()` method (Lines 21-41)**:
- ❌ **Inconsistent handling**: Checks for `dict with 'content'` but then also handles raw `data`
- ❌ **Silent failures**: Returns empty string on any exception
- ❌ **Size limit hardcoded**: 10KB threshold is arbitrary
- ❌ **No format awareness**: Doesn't check if document is binary/base64 before extracting
- ❌ **No document type detection**: Doesn't distinguish text vs binary vs structured data
**`_validateWithAI()` method (Lines 60-200)**:
- ❌ **Forces all content to string**: `content[:2000]` truncation assumes text
- ❌ **No document metadata passed**: Only name and content, no size, format, mimeType info
- ❌ **No binary/base64 handling**: Will fail or show garbage for binary documents
- ❌ **Multiple JSON extraction strategies**: Indicates unreliable AI response parsing
- ❌ **Size limits inconsistent**: 10KB in extraction, 2KB in prompt - why different?
#### 4. **Missing Capabilities**
- ❌ No document size reporting to validator
- ❌ No format validation (txt vs md vs pdf vs docx)
- ❌ No binary data handling (images, PDFs, etc.)
- ❌ No document count/summary statistics
- ❌ No distinction between document types for validation
---
## TARGET DESIGN
### Core Principles
1. **GENERIC**: No hardcoded document names, types, or structures
2. **DOCUMENT-AWARE**: Handle all document types (text, binary, base64, structured)
3. **SIZE-CONSCIOUS**: Never pass full large documents to AI
4. **METADATA-RICH**: Pass document metadata (name, size, format, mimeType) to validator
5. **FORMAT-FLEXIBLE**: Allow format flexibility (md ≈ text, but pdf ≠ docx)
### Target Architecture
```
Documents Input (List[ActionDocument])
Document Analyzer (generic)
- Extract metadata (name, size, mimeType, format)
- Determine content type (text/binary/base64/structured)
- Create preview/summary for large documents
Document Summary (for AI validation)
- Metadata only for binary/base64
- Preview/sample for large text documents
- Full content for small text/structured documents
Validation Prompt Builder (generic)
- Include document summaries (not full content)
- Include document metadata
- Include format validation rules (generic)
AI Validator
- Validates against task objective (generic)
- Validates format compliance (flexible)
- Validates document count/size
```
---
## REQUIRED CHANGES
### 1. **Remove All Hardcoded Checks from modeReact.py**
- ❌ Remove document name checks
- ❌ Remove mimeType-specific logic
- ❌ Remove structure-specific checks
- ✅ Pass ALL documents to validator (let validator decide what to validate)
- ✅ Keep it simple: `validationDocs = result.documents`
### 2. **Redesign contentValidator.py - New Structure**
#### New Method: `_analyzeDocuments(documents)`
```python
def _analyzeDocuments(self, documents: List[Any]) -> List[Dict[str, Any]]:
"""
Generic document analysis - extract metadata and create summaries.
Returns list of document summaries ready for validation prompt.
"""
summaries = []
for doc in documents:
summary = {
"name": getattr(doc, 'documentName', 'Unknown'),
"mimeType": getattr(doc, 'mimeType', 'unknown'),
"format": self._detectFormat(doc),
"size": self._calculateSize(doc),
"type": self._detectContentType(doc), # text/binary/base64/structured
"preview": self._createPreview(doc), # None for binary, sample for large text
"isAccessible": self._isContentAccessible(doc) # Can we read content?
}
summaries.append(summary)
return summaries
```
#### New Method: `_detectFormat(doc)`
- Extract from filename extension or mimeType
- Generic mapping: `text/plain``txt`, `text/markdown``md`, etc.
- Return format string (txt, md, pdf, docx, json, etc.)
#### New Method: `_calculateSize(doc)`
- Calculate document size in bytes
- Handle string, dict, list, bytes, base64
- Return: `{"bytes": int, "readable": "1.5 MB"}`
#### New Method: `_detectContentType(doc)`
- `text`: Readable text content
- `structured`: JSON/dict/list structures
- `binary`: Binary data (PDF, images, etc.)
- `base64`: Base64-encoded data
- Return content type string
#### New Method: `_createPreview(doc)`
- **Binary/Base64**: Return `None` (metadata only)
- **Large text (>50KB)**: Return first 1KB + size indicator
- **Small text (≤50KB)**: Return full content
- **Structured data**: Return JSON string (truncated if large)
#### New Method: `_isContentAccessible(doc)`
- Check if document content can be extracted for validation
- Binary/base64 documents: `False` (validate by metadata only)
- Text/structured documents: `True`
### 3. **Redesign Validation Prompt (Generic)**
```python
validationPrompt = f"""TASK VALIDATION
USER REQUEST: '{intent.get('primaryGoal', 'Unknown')}'
EXPECTED DATA TYPE: {intent.get('dataType', 'unknown')}
EXPECTED FORMAT: {intent.get('expectedFormat', 'unknown')}
SUCCESS CRITERIA ({criteriaCount} items): {successCriteria}
DELIVERED DOCUMENTS ({len(documentSummaries)} items):
{json.dumps(documentSummaries, indent=2)}
VALIDATION RULES:
1. Check if delivered documents match expected data type
2. Check if delivered formats are compatible with expected format
(Note: text formats like txt/md are compatible; pdf ≠ docx but both are documents)
3. Verify each success criterion is met based on document content/metadata
4. Check document sizes are reasonable for the task
5. Rate overall quality (0.0-1.0)
6. Identify specific gaps
7. Suggest next steps
OUTPUT FORMAT - JSON ONLY (no prose):
{{
"overallSuccess": false,
"qualityScore": 0.0,
"dataTypeMatch": false,
"formatMatch": false,
"documentCount": {len(documentSummaries)},
"successCriteriaMet": {[False] * criteriaCount},
"gapAnalysis": "Specific gaps found",
"improvementSuggestions": ["NEXT STEP: Action 1"],
"validationDetails": [
{{
"documentName": "document.ext",
"issues": ["Issue 1"],
"suggestions": ["NEXT STEP: Fix 1"]
}}
]
}}
"""
```
### 4. **Format Validation Logic (Generic & Flexible)**
```python
def _isFormatCompatible(self, deliveredFormat: str, expectedFormat: str) -> bool:
"""
Generic format compatibility check.
- txt/md/html are text formats (compatible with each other)
- pdf/docx/xlsx are document formats (not compatible with each other)
- json/xml are structured formats
- images are image formats
"""
# Text formats are interchangeable
textFormats = ['txt', 'md', 'html', 'text', 'plain']
if deliveredFormat.lower() in textFormats and expectedFormat.lower() in textFormats:
return True
# Exact match
if deliveredFormat.lower() == expectedFormat.lower():
return True
# Structured formats
if deliveredFormat.lower() in ['json', 'xml'] and expectedFormat.lower() in ['json', 'xml']:
return True # Could be made more flexible
return False
```
---
## IMPLEMENTATION PLAN
### Phase 1: Clean Up modeReact.py
- Remove all hardcoded checks
- Simply pass `result.documents` to validator
### Phase 2: Redesign Document Analysis
- Implement `_analyzeDocuments()`
- Implement helper methods: `_detectFormat()`, `_calculateSize()`, `_detectContentType()`, `_createPreview()`
### Phase 3: Redesign Validation Prompt
- Generic prompt with document summaries
- Include metadata, not full content
- Size-aware handling
### Phase 4: Implement Format Validation
- Generic format compatibility logic
- Flexible matching (text formats, document formats, etc.)
### Phase 5: Testing
- Test with text documents (small & large)
- Test with binary documents (PDF, images)
- Test with base64 documents
- Test with structured data (JSON)
---
## KEY DESIGN DECISIONS
1. **Pass ALL documents**: Validator decides what to validate, not the caller
2. **Metadata over content**: For large/binary documents, pass metadata only
3. **Preview samples**: For large text documents, pass preview + size info
4. **Generic prompts**: No task-specific or format-specific logic
5. **Flexible format matching**: Text formats compatible, document formats strict
6. **Size limits**: 50KB threshold for full content (configurable)
7. **Content type detection**: Explicit type detection (text/binary/base64/structured)
---
## BENEFITS OF TARGET DESIGN
- **Generic**: Works with any document type without hardcoding
- **Scalable**: Handles large documents without issues
- **Flexible**: Format validation is flexible where appropriate
- **Maintainable**: Clear separation of concerns
- **Robust**: Handles edge cases (binary, base64, large files)
- **Testable**: Each component can be tested independently

View file

@ -1,14 +1,22 @@
# contentValidator.py
# Content validation for adaptive React mode
# Generic, document-aware validation system
import logging
import json
from typing import List, Dict, Any
import base64
import re
from typing import List, Dict, Any, Optional
logger = logging.getLogger(__name__)
# Configuration constants
MAX_CONTENT_SIZE_FOR_FULL_PREVIEW = 50 * 1024 # 50KB threshold
PREVIEW_SAMPLE_SIZE = 1024 # 1KB preview for large documents
class ContentValidator:
"""Validates delivered content against user intent"""
"""Validates delivered content against user intent - generic and document-aware"""
def __init__(self, services=None, learningEngine=None):
self.services = services
@ -18,78 +26,277 @@ class ContentValidator:
"""Validates delivered content against user intent using AI (single attempt; parse-or-fail)"""
return await self._validateWithAI(documents, intent)
def _extractContent(self, doc: Any) -> str:
"""Extracts content from a document with size protection for large documents"""
def _analyzeDocuments(self, documents: List[Any]) -> List[Dict[str, Any]]:
    """Generic document analysis - create simple summaries with metadata.

    For each document a summary dict is built with name, mimeType, format,
    human-readable size, and a preview (pretty JSON for dict/list payloads,
    plain str() otherwise; truncated with a size marker when oversized).
    A failing document yields a degraded summary carrying the error text.
    """
    summaries = []
    for doc in documents:
        try:
            docName = getattr(doc, 'documentName', 'Unknown')
            docMime = getattr(doc, 'mimeType', 'unknown')
            payload = getattr(doc, 'documentData', None)
            sizeInfo = self._calculateSize(doc)

            preview = None
            if payload is not None:
                # Structured payloads render as pretty JSON; everything else
                # is coerced to its string form.
                if isinstance(payload, (dict, list)):
                    rendered = json.dumps(payload, indent=2, ensure_ascii=False)
                else:
                    rendered = str(payload)
                if len(rendered) > MAX_CONTENT_SIZE_FOR_FULL_PREVIEW:
                    preview = rendered[:PREVIEW_SAMPLE_SIZE] + f"\n\n[Truncated - {self._formatBytes(sizeInfo['bytes'])} total]"
                else:
                    preview = rendered

            summaries.append({
                "name": docName,
                "mimeType": docMime,
                "format": self._detectFormat(doc),
                "size": sizeInfo["readable"],
                "preview": preview
            })
        except Exception as e:
            logger.warning(f"Error analyzing document {getattr(doc, 'documentName', 'Unknown')}: {str(e)}")
            summaries.append({
                "name": getattr(doc, 'documentName', 'Unknown'),
                "mimeType": getattr(doc, 'mimeType', 'unknown'),
                "format": "unknown",
                "size": "0 B",
                "preview": None,
                "error": str(e)
            })
    return summaries
def _calculateAvailablePromptSpace(self, basePromptSizeBytes: int) -> int:
"""Calculate available space for document summaries based on model context length."""
try:
if hasattr(doc, 'documentData'):
data = doc.documentData
if isinstance(data, dict) and 'content' in data:
content = data['content']
# For large content, check size before converting to string
if hasattr(content, '__len__') and len(str(content)) > 10000: # 10KB threshold
# For very large content, return a size indicator instead
return f"[Large document content - {len(str(content))} characters - truncated for validation]"
return str(content)
else:
content = data
# For large content, check size before converting to string
if hasattr(content, '__len__') and len(str(content)) > 10000: # 10KB threshold
return f"[Large document content - {len(str(content))} characters - truncated for validation]"
return str(content)
return ""
except Exception:
return ""
from modules.aicore.aicoreModelRegistry import modelRegistry
from modules.aicore.aicoreModelSelector import modelSelector
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
# Get available models
availableModels = modelRegistry.getAvailableModels()
# Create options for PLAN operation (what validation uses)
options = AiCallOptions(
operationType=OperationTypeEnum.PLAN,
priority=None,
processingMode=None
)
# Get failover model list to find the model that will be used
failoverModels = modelSelector.getFailoverModelList("", "", options, availableModels)
if not failoverModels:
# Fallback: assume 16K tokens context (conservative)
logger.warning("No models available for space calculation, using fallback: 16K tokens")
maxBytes = 16 * 1024 * 4 # 16K tokens * 4 bytes per token
else:
# Use the first (best) model
model = failoverModels[0]
# Calculate 80% of context length in bytes (tokens * 4 bytes per token)
maxBytes = int(model.contextLength * 0.8 * 4)
# Available space = max - base prompt - safety margin (10%)
availableBytes = int((maxBytes - basePromptSizeBytes) * 0.9)
# Ensure minimum available space (at least 1KB)
availableBytes = max(availableBytes, 1024)
logger.debug(f"Prompt space calculation: base={basePromptSizeBytes} bytes, max={maxBytes} bytes, available={availableBytes} bytes")
return availableBytes
except Exception as e:
logger.warning(f"Error calculating available prompt space: {str(e)}, using fallback: 8KB")
# Fallback: assume 8KB available
return 8 * 1024
# Removed schema fallback creator to keep failures explicit
def _analyzeDocumentsWithSizeLimit(self, documents: List[Any], maxTotalBytes: int) -> List[Dict[str, Any]]:
    """Analyze documents with a total size budget, split evenly per document.

    Args:
        documents: ActionDocument-like objects exposing documentName,
            mimeType and documentData attributes.
        maxTotalBytes: Total byte budget for all document summaries combined.

    Returns:
        One summary dict per document (name, mimeType, format, readable size,
        preview). Previews are truncated at a UTF-8-safe byte boundary to fit
        the per-document share of the budget. A failing document yields a
        degraded summary carrying the error text.
    """
    if not documents:
        return []
    # Reserve space for JSON structure overhead (approximately 200 bytes per document)
    jsonOverheadPerDoc = 200
    reservedOverhead = len(documents) * jsonOverheadPerDoc
    availableForContent = max(0, maxTotalBytes - reservedOverhead)
    # documents is guaranteed non-empty here (early return above), so the
    # division is safe; enforce a floor of 100 bytes per document.
    bytesPerDoc = max(availableForContent // len(documents), 100)
    logger.debug(f"Document summary space: total={maxTotalBytes} bytes, available={availableForContent} bytes, perDoc={bytesPerDoc} bytes")
    summaries = []
    for doc in documents:
        try:
            data = getattr(doc, 'documentData', None)
            name = getattr(doc, 'documentName', 'Unknown')
            mimeType = getattr(doc, 'mimeType', 'unknown')
            formatExt = self._detectFormat(doc)
            sizeInfo = self._calculateSize(doc)
            # Create preview with size limit
            preview = None
            if data is not None:
                if isinstance(data, (dict, list)):
                    preview = json.dumps(data, indent=2, ensure_ascii=False)
                else:
                    preview = str(data)
                # Estimate: preview gets ~70% of the document's summary space,
                # the rest covers JSON structure around it.
                maxPreviewBytes = int(bytesPerDoc * 0.7)
                if len(preview.encode('utf-8')) > maxPreviewBytes:
                    # Truncate at a byte boundary; errors='ignore' drops any
                    # multi-byte character split by the cut, so decode cannot raise.
                    preview = preview.encode('utf-8')[:maxPreviewBytes].decode('utf-8', errors='ignore')
                    preview += f"\n\n[Truncated - {self._formatBytes(sizeInfo['bytes'])} total]"
            summaries.append({
                "name": name,
                "mimeType": mimeType,
                "format": formatExt,
                "size": sizeInfo["readable"],
                "preview": preview
            })
        except Exception as e:
            logger.warning(f"Error analyzing document {getattr(doc, 'documentName', 'Unknown')}: {str(e)}")
            summaries.append({
                "name": getattr(doc, 'documentName', 'Unknown'),
                "mimeType": getattr(doc, 'mimeType', 'unknown'),
                "format": "unknown",
                "size": "0 B",
                "preview": None,
                "error": str(e)
            })
    return summaries
def _isValidJsonResponse(self, response: str) -> bool:
"""Checks if response contains valid JSON structure"""
def _detectFormat(self, doc: Any) -> str:
"""Extract format from filename extension (always use extension)"""
try:
import re
# Look for JSON with expected structure
json_match = re.search(r'\{[^{}]*"overallSuccess"[^{}]*\}', response, re.DOTALL)
if json_match:
json.loads(json_match.group(0))
return True
return False
except:
return False
docName = getattr(doc, 'documentName', '')
# Extract from filename extension
if docName and '.' in docName:
ext = docName.rsplit('.', 1)[1].lower()
return ext
return 'unknown'
except Exception as e:
logger.warning(f"Error detecting format: {str(e)}")
return 'unknown'
# Removed text-based fallback extraction to avoid hiding issues
def _calculateSize(self, doc: Any) -> Dict[str, Any]:
"""Calculate document size in bytes and human-readable format"""
try:
if not hasattr(doc, 'documentData') or doc.documentData is None:
return {"bytes": 0, "readable": "0 B"}
data = doc.documentData
size_bytes = 0
if isinstance(data, str):
size_bytes = len(data.encode('utf-8'))
elif isinstance(data, bytes):
size_bytes = len(data)
elif isinstance(data, (dict, list)):
# Estimate JSON size
try:
json_str = json.dumps(data)
size_bytes = len(json_str.encode('utf-8'))
except:
size_bytes = len(str(data).encode('utf-8'))
else:
size_bytes = len(str(data).encode('utf-8'))
# Convert to human-readable format
readable = self._formatBytes(size_bytes)
return {"bytes": size_bytes, "readable": readable}
except Exception as e:
logger.warning(f"Error calculating size: {str(e)}")
return {"bytes": 0, "readable": "0 B"}
def _formatBytes(self, bytes: int) -> str:
"""Format bytes to human-readable string"""
for unit in ['B', 'KB', 'MB', 'GB']:
if bytes < 1024.0:
return f"{bytes:.1f} {unit}"
bytes /= 1024.0
return f"{bytes:.1f} TB"
def _isFormatCompatible(self, deliveredFormat: str, expectedFormat: str) -> bool:
"""
Generic format compatibility check.
- txt/md/html are text formats (compatible with each other)
- pdf/docx/xlsx are document formats (not compatible with each other)
- json/xml are structured formats
- images are image formats
"""
deliveredLower = deliveredFormat.lower()
expectedLower = expectedFormat.lower()
# Exact match
if deliveredLower == expectedLower:
return True
# Text formats are interchangeable
textFormats = ['txt', 'md', 'html', 'text', 'plain']
if deliveredLower in textFormats and expectedLower in textFormats:
return True
# Structured formats
if deliveredLower in ['json', 'xml'] and expectedLower in ['json', 'xml']:
return True
# Document formats are NOT compatible with each other
documentFormats = ['pdf', 'docx', 'xlsx', 'pptx']
if deliveredLower in documentFormats and expectedLower in documentFormats:
return False # pdf ≠ docx
return False
async def _validateWithAI(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
"""AI-based comprehensive validation - single main function"""
"""AI-based comprehensive validation - generic approach"""
try:
if not hasattr(self, 'services') or not self.services or not hasattr(self.services, 'ai'):
return self._createFailedValidationResult("AI service not available")
# Extract content from all documents
documentContents = []
for doc in documents:
content = self._extractContent(doc)
documentContents.append({
"name": getattr(doc, 'documentName', 'Unknown'),
"content": content[:2000] # Limit content for AI processing
})
# Create structured AI validation prompt
# Build prompt base WITHOUT document summaries first
successCriteria = intent.get('successCriteria', [])
criteriaCount = len(successCriteria)
validationPrompt = f"""TASK VALIDATION
promptBase = f"""TASK VALIDATION
USER REQUEST: '{intent.get('primaryGoal', 'Unknown')}'
EXPECTED TYPE: {intent.get('dataType', 'unknown')}
EXPECTED DATA TYPE: {intent.get('dataType', 'unknown')}
EXPECTED FORMAT: {intent.get('expectedFormat', 'unknown')}
SUCCESS CRITERIA ({criteriaCount} items): {successCriteria}
VALIDATION RULES:
1. Check if content matches expected data type
2. Check if content matches expected format
3. Verify each success criterion is met
4. Rate overall quality (0.0-1.0)
5. Identify specific gaps
6. Suggest next steps
1. Check if delivered documents match expected data type
2. Check if delivered formats are compatible with expected format
3. Verify each success criterion is met based on document content/metadata
4. Check document sizes are reasonable for the task
5. Rate overall quality (0.0-1.0)
6. Identify specific gaps based on what the user requested
OUTPUT FORMAT - JSON ONLY (no prose):
{{
@ -97,31 +304,45 @@ OUTPUT FORMAT - JSON ONLY (no prose):
"qualityScore": 0.0,
"dataTypeMatch": false,
"formatMatch": false,
"documentCount": {len(documents)},
"successCriteriaMet": {[False] * criteriaCount},
"gapAnalysis": "Specific gaps found",
"improvementSuggestions": ["NEXT STEP: Action 1", "NEXT STEP: Action 2"],
"gapAnalysis": "Describe what is missing or incorrect",
"improvementSuggestions": ["General action to improve overall result"],
"validationDetails": [
{{
"documentName": "Document Name",
"issues": ["Issue 1", "Issue 2"],
"suggestions": ["NEXT STEP: Fix 1", "NEXT STEP: Fix 2"]
"documentName": "document.ext",
"issues": ["Specific problem with this document"],
"suggestions": ["Specific fix for this document's issues"]
}}
]
}}
DELIVERED CONTENT TO CHECK:
{json.dumps(documentContents, indent=2)}
Field explanations:
- "improvementSuggestions": Overall actions to improve the entire result (general, high-level)
- "validationDetails[].suggestions": Specific fixes for each document's individual issues (document-specific, detailed)
- Do NOT use prefixes like "NEXT STEP:" - describe actions directly
DELIVERED DOCUMENTS ({len(documents)} items):
"""
# Calculate available space for document summaries
# Get the model that will be used for validation
basePromptSize = len(promptBase.encode('utf-8'))
availableBytes = self._calculateAvailablePromptSpace(basePromptSize)
# Analyze documents with size constraints
documentSummaries = self._analyzeDocumentsWithSizeLimit(documents, availableBytes)
# Build final prompt with summaries at the end
documentsJson = json.dumps(documentSummaries, indent=2)
validationPrompt = promptBase + documentsJson
# Call AI service for validation
response = await self.services.ai.callAiPlanning(
prompt=validationPrompt,
placeholders=None
)
# No retries or correction prompts here; parse-or-fail below
if not response or not response.strip():
logger.warning("AI validation returned empty response")
raise ValueError("AI validation failed - empty response")
@ -131,8 +352,6 @@ DELIVERED CONTENT TO CHECK:
logger.debug(f"AI validation response length: {len(result)}")
# Try to find JSON in the response with multiple strategies
import re
# Strategy 1: Look for JSON in markdown code blocks
json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', result, re.DOTALL)
if json_match:
@ -168,6 +387,7 @@ DELIVERED CONTENT TO CHECK:
normalized = {
"overallSuccess": overall if isinstance(overall, bool) else None,
"qualityScore": float(quality) if isinstance(quality, (int, float)) else None,
"documentCount": len(documentSummaries),
"validationDetails": details if isinstance(details, list) else [{
"documentName": "AI Validation",
"gapAnalysis": gap,
@ -197,4 +417,22 @@ DELIVERED CONTENT TO CHECK:
except Exception as e:
logger.error(f"AI validation failed: {str(e)}")
raise
raise
def _createFailedValidationResult(self, errorMessage: str) -> Dict[str, Any]:
"""Create a standardized failed validation result"""
return {
"overallSuccess": False,
"qualityScore": 0.0,
"dataTypeMatch": False,
"formatMatch": False,
"documentCount": 0,
"successCriteriaMet": [],
"gapAnalysis": errorMessage,
"improvementSuggestions": [],
"validationDetails": [],
"schemaCompliant": True,
"originalType": "error",
"missingFields": [],
"error": errorMessage
}

View file

@ -96,14 +96,10 @@ class ReactMode(BaseMode):
# Attach deterministic label for clarity
observation['resultLabel'] = result.resultLabel
# NEW: Add content validation (against original cleaned user prompt / workflow intent)
# Content validation (against original cleaned user prompt / workflow intent)
if getattr(self, 'workflowIntent', None) and result.documents:
# Validate ONLY the produced JSON (structured content), not rendered files
from types import SimpleNamespace
validationDocs = []
if hasattr(result, 'content') and result.content:
validationDocs.append(SimpleNamespace(documentName='generated.json', documentData={'content': result.content}))
validationResult = await self.contentValidator.validateContent(validationDocs, self.workflowIntent)
# Pass ALL documents to validator - validator decides what to validate (generic approach)
validationResult = await self.contentValidator.validateContent(result.documents, self.workflowIntent)
observation['contentValidation'] = validationResult
quality_score = validationResult.get('qualityScore', 0.0)
if quality_score is None:

View file

@ -1,107 +0,0 @@
Module,Function Names,Parameter Names,Variable Names,Total
modules/workflows/methods/methodSharepoint.py,0,2,211,213
modules/workflows/methods/methodOutlook.py,0,3,131,134
modules/services/serviceAi/subDocumentProcessing.py,0,0,104,104
modules/features/syncDelta/mainSyncDelta.py,1,10,88,99
modules/shared/jsonUtils.py,0,3,88,91
modules/services/serviceGeneration/renderers/rendererDocx.py,3,8,79,90
modules/services/serviceWorkflow/mainServiceWorkflow.py,0,3,85,88
modules/services/serviceGeneration/renderers/rendererPptx.py,2,7,73,82
modules/services/serviceGeneration/renderers/rendererPdf.py,3,8,50,61
modules/connectors/connectorVoiceGoogle.py,1,2,52,55
modules/services/serviceGeneration/renderers/rendererHtml.py,3,6,46,55
modules/services/serviceGeneration/renderers/rendererBaseTemplate.py,3,21,27,51
modules/shared/configuration.py,2,17,30,49
modules/services/serviceExtraction/subMerger.py,2,5,31,38
modules/connectors/connectorDbPostgre.py,0,14,20,34
modules/interfaces/interfaceDbAppObjects.py,0,8,26,34
modules/routes/routeSecurityGoogle.py,0,0,32,32
modules/shared/attributeUtils.py,3,4,25,32
modules/interfaces/interfaceDbChatObjects.py,0,4,27,31
modules/routes/routeSecurityAdmin.py,0,2,28,30
modules/services/serviceNeutralization/subProcessList.py,7,0,22,29
modules/services/serviceGeneration/renderers/rendererText.py,3,7,19,29
modules/routes/routeSecurityMsft.py,0,0,27,27
modules/services/serviceGeneration/renderers/rendererMarkdown.py,3,7,17,27
modules/services/serviceGeneration/renderers/rendererXlsx.py,3,0,24,27
modules/services/serviceGeneration/renderers/rendererImage.py,3,2,21,26
modules/security/tokenManager.py,4,7,14,25
modules/workflows/workflowManager.py,0,0,25,25
modules/services/serviceGeneration/renderers/rendererCsv.py,3,5,17,25
modules/shared/auditLogger.py,5,16,3,24
modules/shared/debugLogger.py,0,0,24,24
modules/workflows/processing/shared/placeholderFactory.py,0,0,24,24
modules/interfaces/interfaceDbAppAccess.py,0,2,21,23
modules/connectors/connectorTicketsJira.py,0,0,22,22
modules/services/serviceGeneration/renderers/registry.py,7,3,12,22
modules/routes/routeDataConnections.py,1,1,19,21
modules/security/tokenRefreshService.py,0,2,19,21
modules/services/serviceExtraction/extractors/extractorPptx.py,0,1,16,17
modules/routes/routeSecurityLocal.py,0,0,16,16
modules/workflows/methods/methodBase.py,0,4,12,16
modules/services/serviceGeneration/mainServiceGeneration.py,0,4,11,15
modules/services/serviceUtils/mainServiceUtils.py,0,14,1,15
modules/features/neutralizePlayground/mainNeutralizePlayground.py,8,5,2,15
modules/interfaces/interfaceTicketObjects.py,0,5,9,14
modules/services/serviceNeutralization/subParseString.py,7,0,6,13
modules/workflows/processing/modes/modeReact.py,0,1,11,12
modules/interfaces/interfaceDbComponentAccess.py,0,2,9,11
modules/services/serviceAi/subCoreAi.py,0,0,11,11
modules/services/serviceExtraction/subRegistry.py,0,0,11,11
modules/services/serviceNeutralization/mainServiceNeutralization.py,0,2,9,11
modules/interfaces/interfaceAiObjects.py,0,0,10,10
modules/services/serviceAi/subSharedAiUtils.py,0,3,7,10
modules/connectors/connectorDbJson.py,0,3,6,9
modules/workflows/methods/methodAi.py,0,0,9,9
modules/services/serviceExtraction/subPromptBuilderExtraction.py,0,0,9,9
modules/services/serviceGeneration/subDocumentUtility.py,0,3,6,9
modules/services/serviceNeutralization/subProcessCommon.py,7,2,0,9
modules/services/serviceNeutralization/subProcessText.py,5,0,4,9
modules/interfaces/interfaceDbChatAccess.py,0,2,6,8
modules/security/auth.py,0,1,7,8
modules/aicore/aicorePluginAnthropic.py,0,0,7,7
modules/security/tokenRefreshMiddleware.py,0,2,4,6
modules/services/serviceGeneration/renderers/rendererJson.py,3,0,3,6
analyze_naming_violations.py,5,0,0,5
modules/aicore/aicorePluginOpenai.py,0,0,5,5
modules/routes/routeVoiceGoogle.py,0,0,5,5
modules/shared/eventManagement.py,2,3,0,5
modules/workflows/processing/adaptive/intentAnalyzer.py,0,0,5,5
modules/workflows/processing/shared/executionState.py,0,5,0,5
modules/services/serviceGeneration/subJsonSchema.py,0,0,5,5
modules/services/serviceNeutralization/subPatterns.py,5,0,0,5
modules/services/serviceNeutralization/subProcessBinary.py,4,0,1,5
modules/services/serviceExtraction/extractors/extractorXlsx.py,0,0,5,5
modules/interfaces/interfaceDbComponentObjects.py,0,3,1,4
modules/routes/routeDataNeutralization.py,0,0,4,4
modules/routes/routeWorkflows.py,0,0,4,4
modules/shared/timezoneUtils.py,3,1,0,4
modules/workflows/processing/adaptive/contentValidator.py,0,0,4,4
modules/workflows/processing/core/messageCreator.py,0,0,4,4
modules/services/serviceSharepoint/mainServiceSharepoint.py,0,0,4,4
modules/routes/routeDataUsers.py,0,0,3,3
modules/services/serviceExtraction/subPipeline.py,0,0,3,3
app.py,0,0,2,2
modules/datamodels/datamodelChat.py,0,1,1,2
modules/routes/routeAttributes.py,0,0,2,2
modules/routes/routeDataPrompts.py,0,0,2,2
modules/security/csrf.py,0,1,1,2
modules/security/jwtService.py,0,0,2,2
modules/workflows/processing/adaptive/learningEngine.py,0,0,2,2
modules/workflows/processing/modes/modeActionplan.py,0,0,2,2
modules/workflows/processing/shared/methodDiscovery.py,0,0,2,2
modules/services/serviceNormalization/mainServiceNormalization.py,0,0,2,2
modules/services/serviceExtraction/extractors/extractorImage.py,0,0,2,2
modules/aicore/aicoreBase.py,0,0,1,1
modules/aicore/aicoreModelSelector.py,0,0,1,1
modules/connectors/connectorTicketsClickup.py,0,0,1,1
modules/datamodels/datamodelDocument.py,0,1,0,1
modules/datamodels/datamodelSecurity.py,0,0,1,1
modules/routes/routeAdmin.py,0,0,1,1
modules/routes/routeDataFiles.py,0,0,1,1
modules/workflows/processing/workflowProcessor.py,0,0,1,1
modules/workflows/processing/adaptive/adaptiveLearningEngine.py,0,0,1,1
modules/workflows/processing/core/actionExecutor.py,0,0,1,1
modules/workflows/processing/core/taskPlanner.py,0,0,1,1
modules/workflows/processing/modes/modeBase.py,0,0,1,1
modules/services/serviceAi/subDocumentGeneration.py,0,0,1,1
1 Module Function Names Parameter Names Variable Names Total
2 modules/workflows/methods/methodSharepoint.py 0 2 211 213
3 modules/workflows/methods/methodOutlook.py 0 3 131 134
4 modules/services/serviceAi/subDocumentProcessing.py 0 0 104 104
5 modules/features/syncDelta/mainSyncDelta.py 1 10 88 99
6 modules/shared/jsonUtils.py 0 3 88 91
7 modules/services/serviceGeneration/renderers/rendererDocx.py 3 8 79 90
8 modules/services/serviceWorkflow/mainServiceWorkflow.py 0 3 85 88
9 modules/services/serviceGeneration/renderers/rendererPptx.py 2 7 73 82
10 modules/services/serviceGeneration/renderers/rendererPdf.py 3 8 50 61
11 modules/connectors/connectorVoiceGoogle.py 1 2 52 55
12 modules/services/serviceGeneration/renderers/rendererHtml.py 3 6 46 55
13 modules/services/serviceGeneration/renderers/rendererBaseTemplate.py 3 21 27 51
14 modules/shared/configuration.py 2 17 30 49
15 modules/services/serviceExtraction/subMerger.py 2 5 31 38
16 modules/connectors/connectorDbPostgre.py 0 14 20 34
17 modules/interfaces/interfaceDbAppObjects.py 0 8 26 34
18 modules/routes/routeSecurityGoogle.py 0 0 32 32
19 modules/shared/attributeUtils.py 3 4 25 32
20 modules/interfaces/interfaceDbChatObjects.py 0 4 27 31
21 modules/routes/routeSecurityAdmin.py 0 2 28 30
22 modules/services/serviceNeutralization/subProcessList.py 7 0 22 29
23 modules/services/serviceGeneration/renderers/rendererText.py 3 7 19 29
24 modules/routes/routeSecurityMsft.py 0 0 27 27
25 modules/services/serviceGeneration/renderers/rendererMarkdown.py 3 7 17 27
26 modules/services/serviceGeneration/renderers/rendererXlsx.py 3 0 24 27
27 modules/services/serviceGeneration/renderers/rendererImage.py 3 2 21 26
28 modules/security/tokenManager.py 4 7 14 25
29 modules/workflows/workflowManager.py 0 0 25 25
30 modules/services/serviceGeneration/renderers/rendererCsv.py 3 5 17 25
31 modules/shared/auditLogger.py 5 16 3 24
32 modules/shared/debugLogger.py 0 0 24 24
33 modules/workflows/processing/shared/placeholderFactory.py 0 0 24 24
34 modules/interfaces/interfaceDbAppAccess.py 0 2 21 23
35 modules/connectors/connectorTicketsJira.py 0 0 22 22
36 modules/services/serviceGeneration/renderers/registry.py 7 3 12 22
37 modules/routes/routeDataConnections.py 1 1 19 21
38 modules/security/tokenRefreshService.py 0 2 19 21
39 modules/services/serviceExtraction/extractors/extractorPptx.py 0 1 16 17
40 modules/routes/routeSecurityLocal.py 0 0 16 16
41 modules/workflows/methods/methodBase.py 0 4 12 16
42 modules/services/serviceGeneration/mainServiceGeneration.py 0 4 11 15
43 modules/services/serviceUtils/mainServiceUtils.py 0 14 1 15
44 modules/features/neutralizePlayground/mainNeutralizePlayground.py 8 5 2 15
45 modules/interfaces/interfaceTicketObjects.py 0 5 9 14
46 modules/services/serviceNeutralization/subParseString.py 7 0 6 13
47 modules/workflows/processing/modes/modeReact.py 0 1 11 12
48 modules/interfaces/interfaceDbComponentAccess.py 0 2 9 11
49 modules/services/serviceAi/subCoreAi.py 0 0 11 11
50 modules/services/serviceExtraction/subRegistry.py 0 0 11 11
51 modules/services/serviceNeutralization/mainServiceNeutralization.py 0 2 9 11
52 modules/interfaces/interfaceAiObjects.py 0 0 10 10
53 modules/services/serviceAi/subSharedAiUtils.py 0 3 7 10
54 modules/connectors/connectorDbJson.py 0 3 6 9
55 modules/workflows/methods/methodAi.py 0 0 9 9
56 modules/services/serviceExtraction/subPromptBuilderExtraction.py 0 0 9 9
57 modules/services/serviceGeneration/subDocumentUtility.py 0 3 6 9
58 modules/services/serviceNeutralization/subProcessCommon.py 7 2 0 9
59 modules/services/serviceNeutralization/subProcessText.py 5 0 4 9
60 modules/interfaces/interfaceDbChatAccess.py 0 2 6 8
61 modules/security/auth.py 0 1 7 8
62 modules/aicore/aicorePluginAnthropic.py 0 0 7 7
63 modules/security/tokenRefreshMiddleware.py 0 2 4 6
64 modules/services/serviceGeneration/renderers/rendererJson.py 3 0 3 6
65 analyze_naming_violations.py 5 0 0 5
66 modules/aicore/aicorePluginOpenai.py 0 0 5 5
67 modules/routes/routeVoiceGoogle.py 0 0 5 5
68 modules/shared/eventManagement.py 2 3 0 5
69 modules/workflows/processing/adaptive/intentAnalyzer.py 0 0 5 5
70 modules/workflows/processing/shared/executionState.py 0 5 0 5
71 modules/services/serviceGeneration/subJsonSchema.py 0 0 5 5
72 modules/services/serviceNeutralization/subPatterns.py 5 0 0 5
73 modules/services/serviceNeutralization/subProcessBinary.py 4 0 1 5
74 modules/services/serviceExtraction/extractors/extractorXlsx.py 0 0 5 5
75 modules/interfaces/interfaceDbComponentObjects.py 0 3 1 4
76 modules/routes/routeDataNeutralization.py 0 0 4 4
77 modules/routes/routeWorkflows.py 0 0 4 4
78 modules/shared/timezoneUtils.py 3 1 0 4
79 modules/workflows/processing/adaptive/contentValidator.py 0 0 4 4
80 modules/workflows/processing/core/messageCreator.py 0 0 4 4
81 modules/services/serviceSharepoint/mainServiceSharepoint.py 0 0 4 4
82 modules/routes/routeDataUsers.py 0 0 3 3
83 modules/services/serviceExtraction/subPipeline.py 0 0 3 3
84 app.py 0 0 2 2
85 modules/datamodels/datamodelChat.py 0 1 1 2
86 modules/routes/routeAttributes.py 0 0 2 2
87 modules/routes/routeDataPrompts.py 0 0 2 2
88 modules/security/csrf.py 0 1 1 2
89 modules/security/jwtService.py 0 0 2 2
90 modules/workflows/processing/adaptive/learningEngine.py 0 0 2 2
91 modules/workflows/processing/modes/modeActionplan.py 0 0 2 2
92 modules/workflows/processing/shared/methodDiscovery.py 0 0 2 2
93 modules/services/serviceNormalization/mainServiceNormalization.py 0 0 2 2
94 modules/services/serviceExtraction/extractors/extractorImage.py 0 0 2 2
95 modules/aicore/aicoreBase.py 0 0 1 1
96 modules/aicore/aicoreModelSelector.py 0 0 1 1
97 modules/connectors/connectorTicketsClickup.py 0 0 1 1
98 modules/datamodels/datamodelDocument.py 0 1 0 1
99 modules/datamodels/datamodelSecurity.py 0 0 1 1
100 modules/routes/routeAdmin.py 0 0 1 1
101 modules/routes/routeDataFiles.py 0 0 1 1
102 modules/workflows/processing/workflowProcessor.py 0 0 1 1
103 modules/workflows/processing/adaptive/adaptiveLearningEngine.py 0 0 1 1
104 modules/workflows/processing/core/actionExecutor.py 0 0 1 1
105 modules/workflows/processing/core/taskPlanner.py 0 0 1 1
106 modules/workflows/processing/modes/modeBase.py 0 0 1 1
107 modules/services/serviceAi/subDocumentGeneration.py 0 0 1 1

View file

@ -1,184 +0,0 @@
# Analysis: `processDocumentsWithContinuation` and Subfunctions Usage
## Executive Summary
**FINDING**: The function `processDocumentsWithContinuation` in `subDocumentProcessing.py` is **NOT USED** anywhere in the active codebase. The continuation chain was only referenced by the deleted `subDocumentGeneration.py` module.
---
## Main Function: `processDocumentsWithContinuation`
**Location**: `gateway/modules/services/serviceAi/subDocumentProcessing.py:303`
**Status**: ❌ **NOT USED**
### Usage Search Results
- ❌ No actual code calls to `.processDocumentsWithContinuation(`
- ⚠️ Only mentioned in documentation files:
- `wiki/poweron/appdoc/doc_system_function_relationship_ai.md` (documentation)
- `gateway/callAiWithDocumentGeneration_usage_analysis.md` (previous analysis - noted it was called by deleted code)
### Why It's Not Used
The only caller was `subDocumentGeneration._processDocumentsUnified()` which we just deleted. The current active codebase uses `subCoreAi.callAiDocuments()` which has its own continuation logic via `_callAiWithLooping()`.
---
## Function Call Chain Analysis
```
processDocumentsWithContinuation (line 303) - ❌ NOT USED
├─> _buildContinuationPrompt (line 319, 324) - ❌ ONLY USED HERE
└─> _processWithContinuationLoop (line 322, 373) - ❌ ONLY USED HERE
├─> _buildContinuationIterationPrompt (line 393, 459) - ❌ ONLY USED HERE
└─> processDocumentsPerChunkJsonWithPrompt (line 402) - ✅ USED ELSEWHERE
```
---
## Subfunction Analysis
### 1. `_buildContinuationPrompt`
**Location**: Line 324-371
**Status**: ✅ **USED** (but only internally)
**Called by**: `processDocumentsWithContinuation` (line 319)
**Effectively**: ❌ **UNUSED** (because parent function is unused)
**Internal Usage**:
- Called from `processDocumentsWithContinuation` at line 319
**Functionality**:
- Builds a prompt with continuation instructions
- Adds JSON structure requirements with `"continue": true/false` flag
- Adds `continuation_context` field specification
**Note**: This uses a different continuation pattern than `SubCoreAi._callAiWithLooping()`:
- This uses `"continue": true/false + "continuation_context"` for document sections
- SubCoreAi uses `buildContinuationContext()` with `last_raw_json`
---
### 2. `_processWithContinuationLoop`
**Location**: Line 373-457
**Status**: ✅ **USED** (but only internally)
**Called by**: `processDocumentsWithContinuation` (line 322)
**Effectively**: ❌ **UNUSED** (because parent function is unused)
**Internal Usage**:
- Called from `processDocumentsWithContinuation` at line 322
**External Dependencies**:
- Calls `self._buildContinuationIterationPrompt()` (line 393)
- Calls `self.processDocumentsPerChunkJsonWithPrompt()` (line 402)
**Functionality**:
- Implements continuation loop (max 10 iterations)
- Accumulates sections across iterations
- Checks `continue` flag and `continuation_context` to determine if more iterations needed
- Builds final result with accumulated sections
---
### 3. `_buildContinuationIterationPrompt`
**Location**: Line 459-498
**Status**: ✅ **USED** (but only internally)
**Called by**: `_processWithContinuationLoop` (line 393)
**Effectively**: ❌ **UNUSED** (because parent chain is unused)
**Internal Usage**:
- Called from `_processWithContinuationLoop` at line 393 (in loop, conditionally)
**Functionality**:
- Builds a prompt for continuation iteration with context
- Includes summary of previously generated content (last 3 sections)
- Includes continuation instructions with last section ID, element index, remaining requirements
---
### 4. `processDocumentsPerChunkJsonWithPrompt`
**Location**: Line 219-301
**Status**: ✅ **USED ELSEWHERE**
**Called by**:
- `_processWithContinuationLoop` (line 402)
- Also referenced in backup files (not active code)
**Internal Usage**:
- Called from `_processWithContinuationLoop` at line 402
**External Usage Search**:
- ✅ Used internally by continuation loop
- ⚠️ Referenced in `local/backup/backup_mainServiceAi.py.txt` (backup file, not active)
- ❌ Not used by any other active code
**Functionality**:
- Processes documents with per-chunk AI calls
- Uses a custom prompt instead of default extraction prompt
- Returns merged JSON document
**Note**: This function itself is only used by the continuation loop. However, it's a more general function that could be useful, so it's not "dead code" - it's just currently only used by unused code.
---
## Summary Table
| Function | Line | Status | Called By | Effectively Used? |
|----------|------|--------|-----------|-------------------|
| `processDocumentsWithContinuation` | 303 | ❌ Not used | (external) | ❌ No |
| `_buildContinuationPrompt` | 324 | ✅ Used internally | `processDocumentsWithContinuation:319` | ❌ No |
| `_processWithContinuationLoop` | 373 | ✅ Used internally | `processDocumentsWithContinuation:322` | ❌ No |
| `_buildContinuationIterationPrompt` | 459 | ✅ Used internally | `_processWithContinuationLoop:393` | ❌ No |
| `processDocumentsPerChunkJsonWithPrompt` | 219 | ✅ Used internally | `_processWithContinuationLoop:402` | ⚠️ **ONLY USED BY UNUSED CODE** |
---
## Current Active Implementation
The active continuation logic is in `subCoreAi.callAiDocuments()` → `_callAiWithLooping()`:
- Uses `buildGenerationPrompt()` with `continuationContext` parameter
- Uses `buildContinuationContext()` to build context from sections
- Different continuation pattern (uses `last_raw_json` instead of `continuation_context`)
---
## Dead Code Identification
**Completely Unused Chain** (can be safely removed):
1. ✅ `processDocumentsWithContinuation` - entry point, not called
2. ✅ `_buildContinuationPrompt` - only used by #1
3. ✅ `_processWithContinuationLoop` - only used by #1
4. ✅ `_buildContinuationIterationPrompt` - only used by #3
**Potentially Unused** (only used by dead code):
- ⚠️ `processDocumentsPerChunkJsonWithPrompt` - only caller is dead code, but function is general-purpose
---
## Recommendations
1. **Remove Dead Code Chain**: All four functions (`processDocumentsWithContinuation`, `_buildContinuationPrompt`, `_processWithContinuationLoop`, `_buildContinuationIterationPrompt`) can be safely removed.
2. **For `processDocumentsPerChunkJsonWithPrompt`**:
- **Option A**: Remove if not needed (it's only used by the dead continuation chain)
- **Option B**: Keep if it might be useful for future custom prompt processing
- **Recommendation**: Since it's a general-purpose function that could be useful, keep it but note that it's currently unused.
3. **If Keeping**: Document why this continuation logic exists but is unused, or mark as deprecated/legacy alternative to `_callAiWithLooping()`.
---
## Verification Commands
To verify these findings:
```bash
# Search for actual function calls (should return no results for the main function)
grep -r "\.processDocumentsWithContinuation(" gateway/ --exclude-dir=wiki --exclude-dir=local --exclude-dir=backup
# Search for _buildContinuationPrompt usage (should only find the definition)
grep -r "_buildContinuationPrompt" gateway/ --exclude-dir=wiki --exclude-dir=local --exclude-dir=backup --exclude="*.md"
# Search for _processWithContinuationLoop usage (should only find the definition)
grep -r "_processWithContinuationLoop" gateway/ --exclude-dir=wiki --exclude-dir=local --exclude-dir=backup --exclude="*.md"
```