From 259ccabbe35a2bfebc0eb5b836ded3f3c5e36864 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Fri, 31 Oct 2025 14:28:14 +0100
Subject: [PATCH] Prompt tuning for generation and validation step
---
analyze_naming_violations.py | 242 ------------
function_call_diagram.md | 254 ------------
modules/aicore/aicoreModelSelector.py | 30 +-
modules/datamodels/datamodelExtraction.py | 2 +-
modules/interfaces/interfaceAiObjects.py | 151 ++++---
modules/services/serviceAi/mainServiceAi.py | 170 +++-----
.../mainServiceExtraction.py | 8 +-
.../services/serviceExtraction/subPipeline.py | 109 +-----
.../renderers/rendererImage.py | 49 ++-
.../subPromptBuilderGeneration.py | 18 +-
modules/services/serviceWeb/mainServiceWeb.py | 8 +-
modules/shared/progressLogger.py | 22 +-
modules/workflows/methods/methodAi.py | 80 +---
.../processing/adaptive/VALIDATOR_ANALYSIS.md | 266 +++++++++++++
.../processing/adaptive/contentValidator.py | 370 ++++++++++++++----
.../workflows/processing/modes/modeReact.py | 10 +-
naming_violations_report.csv | 107 -----
...ocumentsWithContinuation_usage_analysis.md | 184 ---------
18 files changed, 821 insertions(+), 1259 deletions(-)
delete mode 100644 analyze_naming_violations.py
delete mode 100644 function_call_diagram.md
create mode 100644 modules/workflows/processing/adaptive/VALIDATOR_ANALYSIS.md
delete mode 100644 naming_violations_report.csv
delete mode 100644 processDocumentsWithContinuation_usage_analysis.md
diff --git a/analyze_naming_violations.py b/analyze_naming_violations.py
deleted file mode 100644
index a4f9b30f..00000000
--- a/analyze_naming_violations.py
+++ /dev/null
@@ -1,242 +0,0 @@
-"""
-Script to analyze codebase for snake_case naming violations that should be camelStyle.
-Excludes routes (decorated endpoint functions) and JSON field names.
-"""
-import ast
-import os
-import re
-from collections import defaultdict
-from pathlib import Path
-from typing import Dict, List, Tuple
-import csv
-
-# Patterns to exclude (external library interfaces, etc.)
-EXCLUDE_PATTERNS = [
- r'@.*\.(get|post|put|delete|patch|options|head)', # FastAPI route decorators
- r'self\.(db|db_|model|orm)', # Database ORM attributes
- r'\.(objects|query|filter|get|all)', # ORM methods
- r'(request|response|response_model|status_code)', # FastAPI params
- r'(snake_case|kebab-case)', # String literals
-]
-
-# External library attribute patterns (should not be changed)
-EXTERNAL_LIB_ATTRIBUTES = {
- 'pydantic', 'fastapi', 'sqlalchemy', 'psycopg', 'requests',
- 'aiohttp', 'azure', 'google', 'openai', 'anthropic', 'reportlab',
- 'docx', 'pptx', 'openpyxl', 'json', 'logging', 'datetime', 'typing'
-}
-
-def isRouteFile(filePath: str) -> bool:
- """Check if file is a route file"""
- return 'routes' in filePath or 'route' in os.path.basename(filePath).lower()
-
-def shouldExcludeName(name: str, context: str = "") -> bool:
- """Check if a name should be excluded from analysis"""
- # Skip if it's a builtin or external library attribute
- if name.startswith('__') and name.endswith('__'):
- return True
-
- # Skip if context suggests external library usage
- for pattern in EXCLUDE_PATTERNS:
- if re.search(pattern, context, re.IGNORECASE):
- return True
-
- return False
-
-def isSnakeCase(name: str) -> bool:
- """Check if a name is snake_case"""
- if not name or name.startswith('_'):
- return False
- # Check if contains underscore and is not all caps
- return '_' in name and not name.isupper()
-
-def analyzeFile(filePath: str) -> Dict[str, List[str]]:
- """Analyze a Python file for naming violations"""
- violations = {
- 'functions': [],
- 'parameters': [],
- 'variables': []
- }
-
- try:
- with open(filePath, 'r', encoding='utf-8') as f:
- content = f.read()
- tree = ast.parse(content, filename=filePath)
- except (SyntaxError, UnicodeDecodeError):
- return violations
-
- # Track current context
- currentClass = None
- inRouteDecorator = False
-
- class NamingAnalyzer(ast.NodeVisitor):
- def __init__(self):
- self.violations = violations
- self.currentClass = None
- self.inRouteDecorator = False
- self.functionDefs = []
-
- def visit_FunctionDef(self, node):
- # Check if this is a route endpoint (has FastAPI decorator)
- isRouteEndpoint = False
- for decorator in node.decorator_list:
- if isinstance(decorator, ast.Attribute):
- if decorator.attr in ['get', 'post', 'put', 'delete', 'patch', 'options', 'head']:
- isRouteEndpoint = True
- break
- elif isinstance(decorator, ast.Call):
- if isinstance(decorator.func, ast.Attribute):
- if decorator.func.attr in ['get', 'post', 'put', 'delete', 'patch', 'options', 'head']:
- isRouteEndpoint = True
- break
-
- # Skip route endpoint function names
- # But we still need to check their parameters and variables
- funcName = node.name
- if not isRouteEndpoint and isSnakeCase(funcName) and not shouldExcludeName(funcName):
- self.violations['functions'].append(f"{funcName} (line {node.lineno})")
-
- # Analyze parameters
- for arg in node.args.args:
- if arg.arg != 'self' and arg.arg != 'cls':
- paramName = arg.arg
- if isSnakeCase(paramName) and not shouldExcludeName(paramName):
- self.violations['parameters'].append(f"{paramName} in {funcName} (line {node.lineno})")
-
- # Analyze function body for local variables
- for stmt in node.body:
- self.visit(stmt)
-
- def visit_ClassDef(self, node):
- oldClass = self.currentClass
- self.currentClass = node.name
- self.generic_visit(node)
- self.currentClass = oldClass
-
- def visit_Assign(self, node):
- for target in node.targets:
- if isinstance(target, ast.Name):
- varName = target.id
- # Skip constants (ALL_CAPS), builtins, and private (_xxx)
- if varName.isupper() or varName.startswith('_'):
- continue
- # Local variables should be camelStyle
- if isSnakeCase(varName) and not shouldExcludeName(varName):
- self.violations['variables'].append(f"{varName} (line {node.lineno})")
-
- def visit_For(self, node):
- if isinstance(node.target, ast.Name):
- varName = node.target.id
- if isSnakeCase(varName) and not shouldExcludeName(varName):
- self.violations['variables'].append(f"{varName} (line {node.lineno})")
- self.generic_visit(node)
-
- def visit_With(self, node):
- if node.items:
- for item in node.items:
- if item.optional_vars:
- if isinstance(item.optional_vars, ast.Name):
- varName = item.optional_vars.id
- if isSnakeCase(varName) and not shouldExcludeName(varName):
- self.violations['variables'].append(f"{varName} (line {node.lineno})")
- self.generic_visit(node)
-
- analyzer = NamingAnalyzer()
- analyzer.visit(tree)
-
- return violations
-
-def analyzeCodebase(rootDir: str = 'gateway') -> Dict[str, Dict[str, int]]:
- """Analyze entire codebase"""
- results = defaultdict(lambda: {
- 'functions': 0,
- 'parameters': 0,
- 'variables': 0,
- 'details': {
- 'functions': [],
- 'parameters': [],
- 'variables': []
- }
- })
-
- # Handle both absolute and relative paths
- rootPath = Path(rootDir)
- if not rootPath.exists():
- # Try relative to current directory
- rootPath = Path('.').resolve() / rootDir
- if not rootPath.exists():
- # Try just current directory if we're already in gateway
- rootPath = Path('.')
-
- # Find all Python files
- for pyFile in rootPath.rglob('*.py'):
- # Skip route files for function name analysis (but analyze their internals)
- filePath = str(pyFile.relative_to(rootPath))
-
- # Skip test files and special scripts
- if 'test' in filePath.lower() or 'tool_' in filePath or '__pycache__' in filePath:
- continue
-
- violations = analyzeFile(str(pyFile))
-
- # Check if there are any violations
- totalViolations = len(violations['functions']) + len(violations['parameters']) + len(violations['variables'])
- if totalViolations > 0:
- moduleName = filePath.replace('\\', '/')
- results[moduleName]['functions'] = len(violations['functions'])
- results[moduleName]['parameters'] = len(violations['parameters'])
- results[moduleName]['variables'] = len(violations['variables'])
- results[moduleName]['details'] = violations
-
- return results
-
-def generateCSV(results: Dict[str, Dict[str, int]], outputFile: str = 'naming_violations.csv'):
- """Generate CSV report"""
- with open(outputFile, 'w', newline='', encoding='utf-8') as f:
- writer = csv.writer(f)
- writer.writerow(['Module', 'Function Names', 'Parameter Names', 'Variable Names', 'Total'])
-
- # Sort by total violations
- sortedResults = sorted(
- results.items(),
- key=lambda x: x[1]['functions'] + x[1]['parameters'] + x[1]['variables'],
- reverse=True
- )
-
- rowsWritten = 0
- for module, stats in sortedResults:
- total = stats['functions'] + stats['parameters'] + stats['variables']
- if total > 0:
- writer.writerow([
- module,
- stats['functions'],
- stats['parameters'],
- stats['variables'],
- total
- ])
- rowsWritten += 1
-
- if rowsWritten == 0:
- print("WARNING: No rows written to CSV despite finding violations!")
-
- print(f"CSV report generated: {outputFile}")
- print(f"Total modules analyzed: {len(results)}")
-
- # Print summary
- totalFuncs = sum(r['functions'] for r in results.values())
- totalParams = sum(r['parameters'] for r in results.values())
- totalVars = sum(r['variables'] for r in results.values())
- print(f"\nSummary:")
- print(f" Function names: {totalFuncs}")
- print(f" Parameter names: {totalParams}")
- print(f" Variable names: {totalVars}")
- print(f" Total violations: {totalFuncs + totalParams + totalVars}")
-
-if __name__ == '__main__':
- print("Analyzing codebase for naming violations...")
- results = analyzeCodebase('gateway')
-
- # Write CSV to gateway directory
- outputPath = Path('gateway') / 'naming_violations_report.csv'
- generateCSV(results, str(outputPath))
-
diff --git a/function_call_diagram.md b/function_call_diagram.md
deleted file mode 100644
index 7ef22517..00000000
--- a/function_call_diagram.md
+++ /dev/null
@@ -1,254 +0,0 @@
-# Complete Function Call Diagram
-
-```mermaid
-graph TB
- subgraph AI_Service["AI Service Modules"]
- MA[mainServiceAi
AiService]
- SC[subCoreAi
SubCoreAi]
- SDG[subDocumentGeneration
SubDocumentGeneration]
- SDP[subDocumentProcessing
SubDocumentProcessing]
- SU[subSharedAiUtils
Utilities]
- end
-
- subgraph EXT_Service["Extraction Service Modules"]
- MSE[mainServiceExtraction
ExtractionService]
- SPE[subPromptBuilderExtraction
buildExtractionPrompt]
- SP[subPipeline
runExtraction]
- end
-
- subgraph GEN_Service["Generation Service Modules"]
- MSG[mainServiceGeneration
GenerationService]
- SPG[subPromptBuilderGeneration
buildGenerationPrompt]
- SJ[subJsonSchema
Schemas]
- end
-
- %% subCoreAi calls
- SC -->|_buildGenerationPrompt| SPG
- SC -->|callAiDocuments| SDP
- SC -->|sanitizePromptContent| SU
-
- %% subDocumentGeneration calls
- SDG -->|processDocumentsWithContinuation| SDP
- SDG -->|buildGenerationPrompt| SPG
- SDG -->|renderReport| MSG
- SDG -->|sanitizePromptContent| SU
-
- %% subDocumentProcessing calls
- SDP -->|extractContent 3x| MSE
- SDP -->|_applyMerging 3x| SP
- SDP -->|readImage| SC
-
- %% mainServiceExtraction calls
- MSE -->|runExtraction| SP
-
- %% subPromptBuilderExtraction calls
- SPE -->|get_document_subJsonSchema| SJ
- SPE -->|sanitizePromptContent| SU
-
- %% mainServiceGeneration calls utilities
- MSG -->|utility functions| SU
-
- %% subCoreAi detailed calls
- SC -.->|aiObjects.call| AI_Interface["AiObjects Interface"]
- SDP -.->|aiObjects.call| AI_Interface
-
- %% Style
- classDef aiClass fill:#e1f5ff,stroke:#0066cc,stroke-width:2px
- classDef extClass fill:#fff5e1,stroke:#cc6600,stroke-width:2px
- classDef genClass fill:#e1ffe1,stroke:#006600,stroke-width:2px
- classDef utilClass fill:#f0f0f0,stroke:#666,stroke-width:2px
- classDef interfaceClass fill:#ffe1f5,stroke:#cc0066,stroke-width:2px
-
- class MA,SC,SDG,SDP,SU aiClass
- class MSE,SPE,SP extClass
- class MSG,SPG,SJ genClass
- class AI_Interface interfaceClass
-```
-
-## Detailed Call Map with Function Names
-
-```mermaid
-graph LR
- %% Nodes
- SC[subCoreAi]
- SDG[subDocumentGeneration]
- SDP[subDocumentProcessing]
- SU[subSharedAiUtils]
- SPE[subPromptBuilderExtraction]
- SPG[subPromptBuilderGeneration]
- MSE[mainServiceExtraction]
- MSG[mainServiceGeneration]
- SP[subPipeline]
- SJ[subJsonSchema]
-
- %% subCoreAi function calls
- SC -->|"_buildGenerationPrompt()
calls"| SPG
- SC -->|"callAiDocuments()
calls callAiText()"| SDP
- SC -->|"sanitizePromptContent()"| SU
-
- %% subDocumentGeneration function calls
- SDG -->|"_processDocumentsUnified()
calls"| SDP
- SDG -->|"_processDocument()
calls"| SPG
- SDG -->|"_processDocument()
calls"| MSG
- SDG -->|"sanitizePromptContent()"| SU
-
- %% subDocumentProcessing function calls
- SDP -->|"extractContent()"| MSE
- SDP -->|"_mergePartResults()
_convertPartResultsToJson()
_mergeChunkResultsJson()
all call"| SP
- SDP -->|"_processChunksWithMapping()
calls readImage()"| SC
-
- %% Extraction service calls
- MSE -->|"extractContent()
calls"| SP
-
- %% Prompt builder calls
- SPE -->|"get_document_subJsonSchema()"| SJ
- SPE -->|"sanitizePromptContent()"| SU
-
- %% Generation service calls
- MSG -->|"uses utility functions"| SU
-
- classDef aiModule fill:#e1f5ff,stroke:#0066cc
- classDef extModule fill:#fff5e1,stroke:#cc6600
- classDef genModule fill:#e1ffe1,stroke:#006600
-
- class SC,SDG,SDP,SU aiModule
- class MSE,SPE,SP extModule
- class MSG,SPG,SJ genModule
-```
-
-## Call Flow by Module
-
-### 1. subCoreAi (SubCoreAi Class)
-**Calls Out:**
-- `buildGenerationPrompt()` → subPromptBuilderGeneration (line 363-366)
-- `callAiText()` → subDocumentProcessing (line 453)
-- `renderReport()` → mainServiceGeneration (line 478-482)
-- `sanitizePromptContent()` → subSharedAiUtils (line 61, via services.ai)
-
-**Called By:**
-- mainServiceAi (creates instance)
-- subDocumentProcessing._processChunksWithMapping (calls readImage at line 672-675)
-
----
-
-### 2. subDocumentGeneration (SubDocumentGeneration Class)
-**Calls Out:**
-- `processDocumentsWithContinuation()` → subDocumentProcessing (line 110)
-- `buildGenerationPrompt()` → subPromptBuilderGeneration (line 330)
-- `renderReport()` → mainServiceGeneration (line 392)
-- `sanitizePromptContent()` → subSharedAiUtils (line 466)
-
-**Called By:**
-- mainServiceAi (creates instance)
-
----
-
-### 3. subDocumentProcessing (SubDocumentProcessing Class)
-**Calls Out:**
-- `extractContent()` → mainServiceExtraction (lines 78, 131, 220)
-- `_applyMerging()` → subPipeline (lines 1044, 1095, 1232, 1293, 1345)
-- `readImage()` → subCoreAi (line 672-675)
-- `sanitizePromptContent()` → subSharedAiUtils (via self.services.ai)
-
-**Called By:**
-- mainServiceAi (creates instance)
-- subCoreAi.callAiDocuments (calls callAiText at line 453)
-- subDocumentGeneration._processDocumentsUnified (calls processDocumentsWithContinuation)
-
----
-
-### 4. mainServiceExtraction (ExtractionService Class)
-**Calls Out:**
-- `runExtraction()` → subPipeline (line 61)
-- Uses ExtractorRegistry from subRegistry
-
-**Called By:**
-- subDocumentProcessing.extractContent (3 times)
-
----
-
-### 5. subPromptBuilderExtraction
-**Calls Out:**
-- `get_document_subJsonSchema()` → subJsonSchema (line 172)
-- `sanitizePromptContent()` → subSharedAiUtils (via services.ai)
-
-**Called By:**
-- mainServiceGeneration (indirectly via getAdaptiveExtractionPrompt)
-
----
-
-### 6. mainServiceGeneration (GenerationService Class)
-**Calls Out:**
-- `get_renderer()` → renderers.registry (line 501)
-- Utility functions from subDocumentUtility
-- Uses modelRegistry (external)
-
-**Called By:**
-- subCoreAi.callAiDocuments (calls renderReport)
-- subDocumentGeneration._processDocument (calls renderReport)
-
----
-
-### 7. subPromptBuilderGeneration
-**Calls Out:**
-- Returns prompt template string
-
-**Called By:**
-- subCoreAi._buildGenerationPrompt (line 363-366)
-- subDocumentGeneration._processDocument (line 330)
-
----
-
-### 8. subPipeline
-**Calls Out:**
-- Creates IntelligentTokenAwareMerger from subMerger (line 96)
-- Uses mergers from merging submodules
-
-**Called By:**
-- mainServiceExtraction.extractContent (calls runExtraction)
-- subDocumentProcessing (calls _applyMerging 5 times)
-
----
-
-### 9. subSharedAiUtils
-**Functions Provided:**
-- `buildPromptWithPlaceholders()`
-- `sanitizePromptContent()`
-- `extractTextFromContentParts()`
-- `reduceText()`
-- `determineCallType()`
-
-**Called By:**
-- subCoreAi (imports and calls functions)
-- subDocumentGeneration (via services.ai.sanitizePromptContent)
-- subPromptBuilderExtraction (via services.ai.sanitizePromptContent)
-
----
-
-### 10. subJsonSchema
-**Functions Provided:**
-- `get_document_subJsonSchema()`
-- `get_multi_document_subJsonSchema()`
-
-**Called By:**
-- subPromptBuilderExtraction.buildExtractionPrompt (line 172)
-
----
-
-## Circular Dependencies
-
-**AI Service Loop:**
-1. subDocumentProcessing → subCoreAi.readImage() (for image processing)
-2. subDocumentProcessing → mainServiceExtraction (for extraction)
-3. mainServiceExtraction → subPipeline (for processing)
-4. subPipeline creates IntelligentTokenAwareMerger
-
-**Flow:**
-```
-subDocumentProcessing.extractContent()
- → mainServiceExtraction.extractContent()
- → subPipeline.runExtraction()
- → returns ContentExtracted
- → processed by subDocumentProcessing
- → calls subPipeline._applyMerging()
-```
diff --git a/modules/aicore/aicoreModelSelector.py b/modules/aicore/aicoreModelSelector.py
index dfc5e118..e1961fa0 100644
--- a/modules/aicore/aicoreModelSelector.py
+++ b/modules/aicore/aicoreModelSelector.py
@@ -70,6 +70,12 @@ class ModelSelector:
promptSize = len(prompt.encode("utf-8"))
contextSize = len(context.encode("utf-8"))
totalSize = promptSize + contextSize
+ # Convert bytes to approximate tokens (1 token ≈ 4 bytes)
+ promptTokens = promptSize / 4
+ contextTokens = contextSize / 4
+ totalTokens = totalSize / 4
+
+ logger.debug(f"Request sizes - Prompt: {promptTokens:.0f} tokens ({promptSize} bytes), Context: {contextTokens:.0f} tokens ({contextSize} bytes), Total: {totalTokens:.0f} tokens ({totalSize} bytes)")
# Step 1: Filter by operation type (MUST match) - check if model has this operation type
operationFiltered = []
@@ -80,10 +86,32 @@ class ModelSelector:
operationFiltered.append(model)
logger.debug(f"After operation type filtering: {len(operationFiltered)} models")
+ if operationFiltered:
+ logger.debug(f"Models with {options.operationType.value}: {[m.name for m in operationFiltered]}")
+
# Step 2: Filter by prompt size (MUST be <= 80% of context size)
- promptFiltered = [m for m in operationFiltered if m.contextLength == 0 or promptSize <= m.contextLength * 0.8]
+ # Note: contextLength is in tokens, so we need to compare tokens with tokens
+ promptFiltered = []
+ for model in operationFiltered:
+ if model.contextLength == 0:
+ # No context length limit - always pass
+ promptFiltered.append(model)
+ else:
+ maxAllowedTokens = model.contextLength * 0.8
+ # Compare prompt tokens (not bytes) with model's token limit
+ if promptTokens <= maxAllowedTokens:
+ promptFiltered.append(model)
+ else:
+ logger.debug(f"Model {model.name} filtered out: promptSize={promptTokens:.0f} tokens > maxAllowed={maxAllowedTokens:.0f} tokens (80% of {model.contextLength} tokens)")
+
logger.debug(f"After prompt size filtering: {len(promptFiltered)} models")
+ if not promptFiltered and operationFiltered:
+ logger.warning(f"All {len(operationFiltered)} models with {options.operationType.value} were filtered out due to prompt size. Prompt: {promptTokens:.0f} tokens. Available models:")
+ for model in operationFiltered:
+ maxAllowed = model.contextLength * 0.8 / 4 if model.contextLength > 0 else "unlimited"
+ logger.warning(f" - {model.name}: contextLength={model.contextLength} tokens, maxAllowed={maxAllowed} tokens")
+
# Step 3: Calculate scores for each model
scoredModels = []
for model in promptFiltered:
diff --git a/modules/datamodels/datamodelExtraction.py b/modules/datamodels/datamodelExtraction.py
index 242d413a..5a530cab 100644
--- a/modules/datamodels/datamodelExtraction.py
+++ b/modules/datamodels/datamodelExtraction.py
@@ -88,4 +88,4 @@ class ExtractionOptions(BaseModel):
maxConcurrentChunks: int = Field(default=5, ge=1, le=20, description="Maximum number of chunks to process concurrently")
class Config:
- arbitraryTypesAllowed = True # Allow OperationTypeEnum import
+ arbitraryTypesAllowed = True # Allow OperationTypeEnum import
\ No newline at end of file
diff --git a/modules/interfaces/interfaceAiObjects.py b/modules/interfaces/interfaceAiObjects.py
index c6b0e62c..f41f0518 100644
--- a/modules/interfaces/interfaceAiObjects.py
+++ b/modules/interfaces/interfaceAiObjects.py
@@ -19,7 +19,7 @@ from modules.datamodels.datamodelAi import (
AiModelCall,
AiModelResponse,
)
-from modules.datamodels.datamodelExtraction import ContentPart
+from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
# Dynamic model registry - models are now loaded from connectors via aicore system
@@ -73,7 +73,8 @@ class AiObjects:
logger.info(f"Selected model: {selectedModel.name} ({selectedModel.displayName})")
return selectedModel.name
- # AI for Extraction and Text Generation
+
+ # AI for Extraction, Processing, Generation
async def call(self, request: AiCallRequest) -> AiCallResponse:
"""Call AI model for text generation with model-aware chunking."""
# Handle content parts (unified path)
@@ -366,7 +367,6 @@ class AiObjects:
content_parts.append(content_part)
# Use existing merging system
- from modules.datamodels.datamodelExtraction import MergeStrategy
merge_strategy = MergeStrategy(
useIntelligentMerging=True,
groupBy="typeGroup",
@@ -374,8 +374,7 @@ class AiObjects:
mergeType="concatenate"
)
- from modules.services.serviceExtraction.subPipeline import _applyMerging
- merged_parts = _applyMerging(content_parts, merge_strategy)
+ merged_parts = applyMerging(content_parts, merge_strategy)
# Convert merged parts back to final string
final_content = "\n\n".join([part.data for part in merged_parts])
@@ -413,7 +412,6 @@ class AiObjects:
content_parts.append(content_part)
# Use existing merging system
- from modules.datamodels.datamodelExtraction import MergeStrategy
merge_strategy = MergeStrategy(
useIntelligentMerging=True,
groupBy="typeGroup",
@@ -421,8 +419,7 @@ class AiObjects:
mergeType="concatenate"
)
- from modules.services.serviceExtraction.subPipeline import _applyMerging
- merged_parts = _applyMerging(content_parts, merge_strategy)
+ merged_parts = applyMerging(content_parts, merge_strategy)
# Convert merged parts back to final string
final_content = "\n\n".join([part.data for part in merged_parts])
@@ -505,82 +502,6 @@ class AiObjects:
)
- # AI for Image Generation
- async def generateImage(self, prompt: str, size: str = "1024x1024", quality: str = "standard", style: str = "vivid", options: AiCallOptions = None) -> AiCallResponse:
- """Generate an image using AI."""
-
- if options is None:
- options = AiCallOptions(operationType=OperationTypeEnum.IMAGE_GENERATE)
-
- # Calculate input bytes
- inputBytes = len(prompt.encode("utf-8"))
-
- try:
- # Select the best model for image generation
- modelName = self._selectModel(prompt, "", options)
- selectedModel = modelRegistry.getModel(modelName)
-
- if not selectedModel:
- raise ValueError(f"Selected model {modelName} not found in registry")
-
- # Get the connector for this model
- connector = modelRegistry.getConnectorForModel(modelName)
- if not connector:
- raise ValueError(f"No connector found for model {modelName}")
-
- # Start timing
- startTime = time.time()
-
- # Create standardized call object for image generation
- modelCall = AiModelCall(
- messages=[{"role": "user", "content": prompt}],
- model=selectedModel,
- options=AiCallOptions(size=size, quality=quality, style=style)
- )
-
- # Call the model with standardized interface
- if selectedModel.functionCall:
- modelResponse = await selectedModel.functionCall(modelCall)
-
- # Extract content from standardized response
- if not modelResponse.success:
- raise ValueError(f"Model call failed: {modelResponse.error}")
- content = modelResponse.content
- else:
- raise ValueError(f"Model {modelName} has no function call defined")
-
- # Calculate timing and output bytes
- endTime = time.time()
- processingTime = endTime - startTime
- outputBytes = len(content.encode("utf-8"))
-
- # Calculate price using model's own price calculation method
- priceUsd = selectedModel.calculatePriceUsd(processingTime, inputBytes, outputBytes)
-
- logger.info(f"✅ Image generation successful with model: {modelName}")
- return AiCallResponse(
- success=True,
- content=content,
- modelName=modelName,
- processingTime=processingTime,
- priceUsd=priceUsd,
- bytesSent=inputBytes,
- bytesReceived=outputBytes,
- errorCount=0
- )
-
- except Exception as e:
- logger.error(f"❌ Image generation failed with model {modelName}: {str(e)}")
- return AiCallResponse(
- content=f"Image generation failed: {str(e)}",
- modelName=modelName,
- priceUsd=0.0,
- processingTime=0.0,
- bytesSent=inputBytes,
- bytesReceived=0,
- errorCount=1
- )
-
# Utility methods
async def listAvailableModels(self, connectorType: str = None) -> List[Dict[str, Any]]:
"""List available models, optionally filtered by connector type."""
@@ -601,3 +522,65 @@ class AiObjects:
models = modelRegistry.getModelsByTag(tag)
return [model.name for model in models]
+
+def applyMerging(parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
+ """Apply merging strategy to parts with intelligent token-aware merging."""
+ logger.debug(f"applyMerging called with {len(parts)} parts")
+
+ # Import merging dependencies
+ from modules.services.serviceExtraction.merging.mergerText import TextMerger
+ from modules.services.serviceExtraction.merging.mergerTable import TableMerger
+ from modules.services.serviceExtraction.merging.mergerDefault import DefaultMerger
+ from modules.services.serviceExtraction.subMerger import IntelligentTokenAwareMerger
+
+ # Check if intelligent merging is enabled
+ if strategy.useIntelligentMerging:
+ modelCapabilities = strategy.capabilities or {}
+ subMerger = IntelligentTokenAwareMerger(modelCapabilities)
+
+ # Use intelligent merging for all parts
+ merged = subMerger.mergeChunksIntelligently(parts, strategy.prompt or "")
+
+ # Calculate and log optimization stats
+ stats = subMerger.calculateOptimizationStats(parts, merged)
+ logger.info(f"🧠 Intelligent merging stats: {stats}")
+ logger.debug(f"Intelligent merging: {stats['original_ai_calls']} → {stats['optimized_ai_calls']} calls ({stats['reduction_percent']}% reduction)")
+
+ return merged
+
+ # Fallback to traditional merging
+ textMerger = TextMerger()
+ tableMerger = TableMerger()
+ defaultMerger = DefaultMerger()
+
+ # Group by typeGroup
+ textParts = [p for p in parts if p.typeGroup == "text"]
+ tableParts = [p for p in parts if p.typeGroup == "table"]
+ structureParts = [p for p in parts if p.typeGroup == "structure"]
+ otherParts = [p for p in parts if p.typeGroup not in ("text", "table", "structure")]
+
+ logger.debug(f"Grouped - text: {len(textParts)}, table: {len(tableParts)}, structure: {len(structureParts)}, other: {len(otherParts)}")
+
+ merged: List[ContentPart] = []
+
+ if textParts:
+ textMerged = textMerger.merge(textParts, strategy)
+ logger.debug(f"TextMerger merged {len(textParts)} parts into {len(textMerged)} parts")
+ merged.extend(textMerged)
+ if tableParts:
+ tableMerged = tableMerger.merge(tableParts, strategy)
+ logger.debug(f"TableMerger merged {len(tableParts)} parts into {len(tableMerged)} parts")
+ merged.extend(tableMerged)
+ if structureParts:
+ # For now, treat structure like text
+ structureMerged = textMerger.merge(structureParts, strategy)
+ logger.debug(f"StructureMerger merged {len(structureParts)} parts into {len(structureMerged)} parts")
+ merged.extend(structureMerged)
+ if otherParts:
+ otherMerged = defaultMerger.merge(otherParts, strategy)
+ logger.debug(f"DefaultMerger merged {len(otherParts)} parts into {len(otherMerged)} parts")
+ merged.extend(otherMerged)
+
+ logger.debug(f"applyMerging returning {len(merged)} parts")
+ return merged
+
diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py
index 381b87fa..bd9919d5 100644
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@@ -302,6 +302,7 @@ Respond with ONLY a JSON object in this exact format:
# Done - build final result
if operationId:
self.services.workflow.progressLogUpdate(operationId, 0.95, f"Generation complete ({iteration} iterations, {len(allSections)} sections)")
+ logger.info(f"Generation complete after {iteration} iterations: {len(allSections)} sections")
break
except Exception as e:
@@ -385,7 +386,7 @@ Respond with ONLY a JSON object in this exact format:
rawResponse: str = None
) -> bool:
"""
- Determine if generation should continue based on JSON completeness and complete_response flag.
+ Determine if generation should continue based on JSON completeness, complete_response flag, and task completion.
Returns True if we should continue, False if done.
"""
if len(allSections) == 0:
@@ -395,13 +396,22 @@ Respond with ONLY a JSON object in this exact format:
if rawResponse:
import re
if re.search(r'"complete_response"\s*:\s*true', rawResponse, re.IGNORECASE):
+ logger.info(f"Iteration {iteration}: AI marked response as complete (complete_response flag detected)")
return False
- # If JSON was complete (and no complete_response flag), we're done
- # If JSON was broken and repaired, continue to get more content
+ # If JSON was complete, stop (AI should have set complete_response if task is done)
+ # For continuation iterations (iteration > 1), if JSON is complete but no flag was set,
+ # stop to prevent infinite loops - AI had a chance to set the flag
if wasJsonComplete:
+ if iteration > 1:
+ # Continuation mode: JSON complete without flag means we're likely done
+ # Stop to prevent infinite loops
+ logger.info(f"Iteration {iteration}: JSON complete without complete_response flag - stopping")
+ return False
+ # First iteration with complete JSON - done
return False
else:
+ # JSON was incomplete/broken - continue
return True
def _buildFinalResultFromSections(
@@ -523,25 +533,61 @@ Respond with ONLY a JSON object in this exact format:
)
try:
- # Ensure AI connectors are initialized before delegating to generator
- if hasattr(self.services, 'ai') and hasattr(self.services.ai, '_ensureAiObjectsInitialized'):
- await self.services.ai._ensureAiObjectsInitialized()
if options is None or (hasattr(options, 'operationType') and options.operationType is None):
# Use AI to determine parameters ONLY when truly needed (options=None OR operationType=None)
self.services.workflow.progressLogUpdate(aiOperationId, 0.1, "Analyzing prompt parameters")
options = await self._analyzePromptAndCreateOptions(prompt)
- # Route image-generation requests directly to image pipeline to avoid JSON loop
- imgFormats = {"png", "jpg", "jpeg", "webp", "image", "base64"}
+ # Handle image generation requests directly via generic path
opType = getattr(options, "operationType", None)
- fmt = (outputFormat or "").lower() if outputFormat else None
- isImageRequest = (opType == OperationTypeEnum.IMAGE_GENERATE) or (fmt in imgFormats)
+ isImageRequest = (opType == OperationTypeEnum.IMAGE_GENERATE)
+
if isImageRequest:
+ # Image generation uses generic call path but bypasses document generation pipeline
self.services.workflow.progressLogUpdate(aiOperationId, 0.4, "Calling AI for image generation")
- imageResponse = await self.generateImage(prompt, options=options)
- self.services.workflow.progressLogUpdate(aiOperationId, 0.9, "Image generated")
- self.services.workflow.progressLogFinish(aiOperationId, True)
- return imageResponse
+
+ # Call via generic path (no looping for images)
+ request = AiCallRequest(
+ prompt=prompt,
+ context="",
+ options=options
+ )
+
+ response = await self.aiObjects.call(request)
+
+ # Extract image data from response
+ if response.content:
+ # For base64 format, return in expected format
+ if outputFormat == "base64":
+ result = {
+ "success": True,
+ "image_data": response.content,
+ "documents": [{
+ "documentName": "generated_image.png",
+ "documentData": response.content,
+ "mimeType": "image/png",
+ "title": title or "Generated Image"
+ }]
+ }
+ else:
+ # Return raw content for other formats
+ result = response.content
+
+ # Emit stats for image generation
+ self.services.workflow.storeWorkflowStat(
+ self.services.currentWorkflow,
+ response,
+ f"ai.generate.image"
+ )
+
+ self.services.workflow.progressLogUpdate(aiOperationId, 0.9, "Image generated")
+ self.services.workflow.progressLogFinish(aiOperationId, True)
+ return result
+ else:
+ errorMsg = f"No image data returned: {response.content}"
+ logger.error(f"Error in AI image generation: {errorMsg}")
+ self.services.workflow.progressLogFinish(aiOperationId, False)
+ return {"success": False, "error": errorMsg}
# CRITICAL: For document generation with JSON templates, NEVER compress the prompt
# Compressing would truncate the template structure and confuse the AI
@@ -656,102 +702,6 @@ Respond with ONLY a JSON object in this exact format:
self.services.workflow.progressLogFinish(aiOperationId, False)
raise
- # AI Image Analysis
- async def readImage(
- self,
- prompt: str,
- imageData: Union[str, bytes],
- mimeType: str = None,
- options: Optional[AiCallOptions] = None,
- ) -> str:
- """Call AI for image analysis using interface.call() with contentParts."""
- await self._ensureAiObjectsInitialized()
-
- try:
- # Check if imageData is valid
- if not imageData:
- error_msg = "No image data provided"
- logger.error(f"Error in AI image analysis: {error_msg}")
- return f"Error: {error_msg}"
-
-
- # Always use IMAGE_ANALYSE operation type for image processing
- if options is None:
- options = AiCallOptions(operationType=OperationTypeEnum.IMAGE_ANALYSE)
- else:
- # Override the operation type to ensure image analysis
- options.operationType = OperationTypeEnum.IMAGE_ANALYSE
-
- # Create content parts with image data
- from modules.datamodels.datamodelExtraction import ContentPart
- import base64
-
- # ContentPart.data must be a string - convert bytes to base64 if needed
- if isinstance(imageData, bytes):
- imageDataStr = base64.b64encode(imageData).decode('utf-8')
- else:
- # Already a base64 string
- imageDataStr = imageData
-
- imagePart = ContentPart(
- id="image_0",
- parentId=None,
- label="Image",
- typeGroup="image",
- mimeType=mimeType or "image/jpeg",
- data=imageDataStr, # Must be a string (base64 encoded)
- metadata={"imageAnalysis": True}
- )
-
- # Create request with content parts
- request = AiCallRequest(
- prompt=prompt,
- context="",
- options=options,
- contentParts=[imagePart]
- )
-
- response = await self.aiObjects.call(request)
- result = response.content
-
- # Check if result is valid
- if not result or (isinstance(result, str) and not result.strip()):
- error_msg = f"No response from AI image analysis (result: {repr(result)})"
- logger.error(f"Error in AI image analysis: {error_msg}")
- return f"Error: {error_msg}"
-
- return result
- except Exception as e:
- logger.error(f"Error in AI image analysis: {str(e)}")
- return f"Error: {str(e)}"
-
- # AI Image Generation
- async def generateImage(
- self,
- prompt: str,
- size: str = "1024x1024",
- quality: str = "standard",
- style: str = "vivid",
- options: Optional[AiCallOptions] = None,
- ) -> Dict[str, Any]:
- """Generate an image using AI using interface.generateImage()."""
- await self._ensureAiObjectsInitialized()
-
- try:
- response = await self.aiObjects.generateImage(prompt, size, quality, style, options)
-
- # Emit stats for image generation
- self.services.workflow.storeWorkflowStat(
- self.services.currentWorkflow,
- response,
- f"ai.generate.image"
- )
-
- return response
- except Exception as e:
- logger.error(f"Error in AI image generation: {str(e)}")
- return {"success": False, "error": str(e)}
-
async def callAiText(
self,
prompt: str,
diff --git a/modules/services/serviceExtraction/mainServiceExtraction.py b/modules/services/serviceExtraction/mainServiceExtraction.py
index b61cd1cc..5d5079f1 100644
--- a/modules/services/serviceExtraction/mainServiceExtraction.py
+++ b/modules/services/serviceExtraction/mainServiceExtraction.py
@@ -5,7 +5,7 @@ import time
import asyncio
from .subRegistry import ExtractorRegistry, ChunkerRegistry
-from .subPipeline import runExtraction, _applyMerging
+from .subPipeline import runExtraction
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy, ExtractionOptions, PartResult
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallResponse, AiCallRequest, AiCallOptions, OperationTypeEnum
@@ -14,6 +14,9 @@ from modules.aicore.aicoreModelRegistry import modelRegistry
logger = logging.getLogger(__name__)
+# Rebuild ExtractionOptions to resolve forward references after all imports are complete
+ExtractionOptions.model_rebuild()
+
class ExtractionService:
def __init__(self, services: Optional[Any] = None):
@@ -649,7 +652,8 @@ class ExtractionService:
# Apply existing merging logic using the sophisticated merging system
- merged_parts = _applyMerging(content_parts, merge_strategy)
+ from modules.interfaces.interfaceAiObjects import applyMerging
+ merged_parts = applyMerging(content_parts, merge_strategy)
# Convert merged parts back to final string
final_content = "\n\n".join([part.data for part in merged_parts])
diff --git a/modules/services/serviceExtraction/subPipeline.py b/modules/services/serviceExtraction/subPipeline.py
index f36afe8e..510bcca8 100644
--- a/modules/services/serviceExtraction/subPipeline.py
+++ b/modules/services/serviceExtraction/subPipeline.py
@@ -4,55 +4,11 @@ import logging
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, ExtractionOptions, MergeStrategy
from .subUtils import makeId
from .subRegistry import ExtractorRegistry, ChunkerRegistry
-from .merging.mergerText import TextMerger
-from .merging.mergerTable import TableMerger
-from .merging.mergerDefault import DefaultMerger
-from .subMerger import IntelligentTokenAwareMerger
logger = logging.getLogger(__name__)
-def _mergeParts(parts: List[ContentPart], mergeStrategy: MergeStrategy) -> List[ContentPart]:
- """Merge parts based on the provided strategy."""
- if not parts or not mergeStrategy:
- return parts
-
- groupBy = mergeStrategy.groupBy
- orderBy = mergeStrategy.orderBy
-
- # Group parts by the specified field
- groups = {}
- for part in parts:
- key = getattr(part, groupBy, "unknown")
- if key not in groups:
- groups[key] = []
- groups[key].append(part)
-
- # Merge each group
- merged_parts = []
- for group_key, group_parts in groups.items():
- if len(group_parts) == 1:
- merged_parts.extend(group_parts)
- else:
- # Sort by orderBy field if specified
- if orderBy:
- group_parts.sort(key=lambda p: getattr(p, orderBy, ""))
-
- # Use appropriate merger based on type
- type_group = group_parts[0].typeGroup if group_parts else "unknown"
-
- if type_group == "text":
- merger = TextMerger()
- elif type_group == "table":
- merger = TableMerger()
- else:
- merger = DefaultMerger()
-
- # Merge the group
- merged = merger.merge(group_parts, mergeStrategy)
- merged_parts.extend(merged)
-
- return merged_parts
+# REMOVED: _mergeParts function - unused, functionality replaced by applyMerging in interfaceAiObjects.py
def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: ExtractionOptions) -> ContentExtracted:
@@ -78,69 +34,12 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
# Apply merging strategy if provided (preserve existing logic)
if options.mergeStrategy:
- parts = _applyMerging(parts, options.mergeStrategy)
+ from modules.interfaces.interfaceAiObjects import applyMerging
+ parts = applyMerging(parts, options.mergeStrategy)
return ContentExtracted(id=makeId(), parts=parts)
# REMOVED: poolAndLimit function - chunking now handled in AI call phase
-
-
-def _applyMerging(parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
- """Apply merging strategy to parts with intelligent token-aware merging."""
- logger.debug(f"_applyMerging called with {len(parts)} parts")
-
- # Check if intelligent merging is enabled
- if strategy.useIntelligentMerging:
- model_capabilities = strategy.capabilities or {}
- subMerger = IntelligentTokenAwareMerger(model_capabilities)
-
- # Use intelligent merging for all parts
- merged = subMerger.mergeChunksIntelligently(parts, strategy.prompt or "")
-
- # Calculate and log optimization stats
- stats = subMerger.calculateOptimizationStats(parts, merged)
- logger.info(f"🧠 Intelligent merging stats: {stats}")
- logger.debug(f"Intelligent merging: {stats['original_ai_calls']} → {stats['optimized_ai_calls']} calls ({stats['reduction_percent']}% reduction)")
-
- return merged
-
- # Fallback to traditional merging
- textMerger = TextMerger()
- tableMerger = TableMerger()
- defaultMerger = DefaultMerger()
-
- # Group by typeGroup
- textParts = [p for p in parts if p.typeGroup == "text"]
- tableParts = [p for p in parts if p.typeGroup == "table"]
- structureParts = [p for p in parts if p.typeGroup == "structure"]
- otherParts = [p for p in parts if p.typeGroup not in ("text", "table", "structure")]
-
- logger.debug(f"Grouped - text: {len(textParts)}, table: {len(tableParts)}, structure: {len(structureParts)}, other: {len(otherParts)}")
-
- merged: List[ContentPart] = []
-
- if textParts:
- textMerged = textMerger.merge(textParts, strategy)
- logger.debug(f"TextMerger merged {len(textParts)} parts into {len(textMerged)} parts")
- merged.extend(textMerged)
- if tableParts:
- tableMerged = tableMerger.merge(tableParts, strategy)
- logger.debug(f"TableMerger merged {len(tableParts)} parts into {len(tableMerged)} parts")
- merged.extend(tableMerged)
- if structureParts:
- # For now, treat structure like text
- structureMerged = textMerger.merge(structureParts, strategy)
- logger.debug(f"StructureMerger merged {len(structureParts)} parts into {len(structureMerged)} parts")
- merged.extend(structureMerged)
- if otherParts:
- otherMerged = defaultMerger.merge(otherParts, strategy)
- logger.debug(f"DefaultMerger merged {len(otherParts)} parts into {len(otherMerged)} parts")
- merged.extend(otherMerged)
-
- logger.debug(f"_applyMerging returning {len(merged)} parts")
- return merged
-
-
-# REMOVED: _applySizeLimit function - no longer needed after removing poolAndLimit
+# REMOVED: _applyMerging function - moved to interfaceAiObjects.py (renamed applyMerging) for proper interface-level access
diff --git a/modules/services/serviceGeneration/renderers/rendererImage.py b/modules/services/serviceGeneration/renderers/rendererImage.py
index bfc89927..71ef41b1 100644
--- a/modules/services/serviceGeneration/renderers/rendererImage.py
+++ b/modules/services/serviceGeneration/renderers/rendererImage.py
@@ -61,27 +61,58 @@ class RendererImage(BaseRenderer):
# Save image generation prompt to debug
aiService.services.utils.writeDebugFile(imagePrompt, "image_generation_prompt")
- # Generate image using AI
- imageResult = await aiService.aiObjects.generateImage(
+ # Format prompt as JSON with image generation parameters
+ from modules.datamodels.datamodelAi import AiCallPromptImage, AiCallOptions, OperationTypeEnum
+ import json
+
+ promptModel = AiCallPromptImage(
prompt=imagePrompt,
size="1024x1024",
quality="standard",
style="vivid"
)
+ promptJson = promptModel.model_dump_json(exclude_none=True, indent=2)
+
+ # Use generic path via callAiDocuments
+ options = AiCallOptions(
+ operationType=OperationTypeEnum.IMAGE_GENERATE,
+ resultFormat="base64"
+ )
+
+ # Call via generic path
+ imageResult = await aiService.callAiDocuments(
+ prompt=promptJson,
+ documents=None,
+ options=options,
+ outputFormat="base64"
+ )
# Save image generation response to debug
aiService.services.utils.writeDebugFile(str(imageResult), "image_generation_response")
# Extract base64 image data from result
- if imageResult and imageResult.get("success", False):
- imageData = imageResult.get("image_data", "")
- if imageData:
- return imageData
- else:
+ # The generic path returns a dict with documents array for base64 format
+ if isinstance(imageResult, dict):
+ if imageResult.get("success", False):
+ # Check if it's the new format with documents array
+ documents = imageResult.get("documents", [])
+ if documents and len(documents) > 0:
+ imageData = documents[0].get("documentData", "")
+ if imageData:
+ return imageData
+ # Fallback: check for image_data field
+ imageData = imageResult.get("image_data", "")
+ if imageData:
+ return imageData
raise ValueError("No image data returned from AI")
+ else:
+ errorMsg = imageResult.get("error", "Unknown error")
+ raise ValueError(f"AI image generation failed: {errorMsg}")
+ elif isinstance(imageResult, str):
+ # If it's just a string, it might be base64 data directly
+ return imageResult
else:
- errorMsg = imageResult.get("error", "Unknown error") if imageResult else "No result"
- raise ValueError(f"AI image generation failed: {errorMsg}")
+ raise ValueError(f"Unexpected image generation result format: {type(imageResult)}")
except Exception as e:
self.logger.error(f"Error generating AI image: {str(e)}")
diff --git a/modules/services/serviceGeneration/subPromptBuilderGeneration.py b/modules/services/serviceGeneration/subPromptBuilderGeneration.py
index 055bf397..a2a98f66 100644
--- a/modules/services/serviceGeneration/subPromptBuilderGeneration.py
+++ b/modules/services/serviceGeneration/subPromptBuilderGeneration.py
@@ -93,10 +93,10 @@ Instructions:
- Arrays must contain ONLY JSON values; do not include comments or ellipses.
- Use ONLY the element structures shown in the template.
- Continue from where it stopped — add NEW items only; do not repeat existing items.
-- Generate all remaining content needed to complete the user request.
+- Generate remaining content to complete the user request.
- Fill with actual content (no placeholders or instructional text such as "Add more...").
-- When fully complete, add "complete_response": true at root level.
-- Output JSON only; no markdown fences or extra text before/after.
+- When the request is fully satisfied, add "complete_response": true at root level.
+- Output JSON only; no markdown fences or extra text.
Continue generating:
"""
@@ -110,14 +110,12 @@ JSON structure template:
{jsonTemplate}
Instructions:
-- Start your response with {{"metadata": ...}} — return COMPLETE, STRICT JSON.
-- Return ONLY valid JSON (strict). No comments of any kind (no //, /* */, or #). No trailing commas. Strings must use double quotes.
-- Arrays must contain ONLY JSON values; do not include comments or ellipses.
-- Do NOT reuse the example section IDs from the template; create your own.
-- Use ONLY the element structures shown in the template.
+- Start with {{"metadata": ...}} — return COMPLETE, STRICT JSON.
+- Return ONLY valid JSON (strict). No comments. No trailing commas. Use double quotes.
+- Do NOT reuse example section IDs; create your own.
- Generate complete content based on the user request.
-- When fully complete, add "complete_response": true at root level.
-- Output JSON only; no markdown fences or any additional text.
+- When the request is fully satisfied, add "complete_response": true at root level.
+- Output JSON only; no markdown fences or extra text.
Generate your complete response starting from {{"metadata": ...}}:
"""
diff --git a/modules/services/serviceWeb/mainServiceWeb.py b/modules/services/serviceWeb/mainServiceWeb.py
index 3e43da4a..e34d8b96 100644
--- a/modules/services/serviceWeb/mainServiceWeb.py
+++ b/modules/services/serviceWeb/mainServiceWeb.py
@@ -26,7 +26,7 @@ class WebService:
language: Optional[str],
researchDepth: str = "general",
operationId: str = None
- ) -> Dict[str, Any]:
+ ) -> Dict[str, Any]:
"""
Perform web research in two steps:
1. Use AI to analyze prompt and extract parameters + URLs
@@ -128,7 +128,7 @@ class WebService:
country: Optional[str],
language: Optional[str],
researchDepth: str = "general"
- ) -> Dict[str, Any]:
+ ) -> Dict[str, Any]:
"""
Use AI to analyze prompt and extract:
- URLs from the prompt text
@@ -195,7 +195,7 @@ Return ONLY valid JSON, no additional text:
maxNumberPages: int,
country: Optional[str],
language: Optional[str]
- ) -> List[str]:
+ ) -> List[str]:
"""Perform web search to find URLs."""
try:
# Build search prompt model
@@ -248,7 +248,7 @@ Return ONLY valid JSON, no additional text:
instruction: str,
urls: List[str],
maxDepth: int = 2
- ) -> List[Dict[str, Any]]:
+ ) -> List[Dict[str, Any]]:
"""Perform web crawl on list of URLs - calls plugin for each URL individually."""
crawlResults = []
diff --git a/modules/shared/progressLogger.py b/modules/shared/progressLogger.py
index 09a66bab..e6ecdee7 100644
--- a/modules/shared/progressLogger.py
+++ b/modules/shared/progressLogger.py
@@ -23,6 +23,7 @@ class ProgressLogger:
self.workflowService = workflowService
self.workflow = workflow
self.activeOperations = {}
+ self.finishedOperations = set() # Track finished operations to avoid repeated warnings
def startOperation(self, operationId: str, serviceName: str, actionName: str, context: str = ""):
"""Start a new long-running operation.
@@ -33,6 +34,9 @@ class ProgressLogger:
actionName: Name of the action being performed
context: Additional context information
"""
+ # Remove from finished operations if it was there (for restart scenarios)
+ self.finishedOperations.discard(operationId)
+
self.activeOperations[operationId] = {
'service': serviceName,
'action': actionName,
@@ -51,7 +55,14 @@ class ProgressLogger:
statusUpdate: Additional status information
"""
if operationId not in self.activeOperations:
- logger.warning(f"Operation {operationId} not found for progress update")
+ # Only warn once per operation ID if it was already finished
+ if operationId in self.finishedOperations:
+ # Operation already finished - silently ignore subsequent updates
+ return
+ else:
+ # Operation never started - log warning once and mark as problematic
+ logger.warning(f"Operation {operationId} not found for progress update (operation never started)")
+ self.finishedOperations.add(operationId) # Prevent repeated warnings
return
op = self.activeOperations[operationId]
@@ -67,7 +78,10 @@ class ProgressLogger:
success: Whether the operation completed successfully
"""
if operationId not in self.activeOperations:
- logger.warning(f"Operation {operationId} not found for completion")
+ # Only warn once if operation was already finished
+ if operationId not in self.finishedOperations:
+ logger.warning(f"Operation {operationId} not found for completion (operation never started or already finished)")
+ self.finishedOperations.add(operationId)
return
op = self.activeOperations[operationId]
@@ -83,6 +97,9 @@ class ProgressLogger:
# Remove from active operations AFTER creating the log
del self.activeOperations[operationId]
+
+ # Mark as finished to prevent repeated warnings from updateOperation calls
+ self.finishedOperations.add(operationId)
def _logProgress(self, operationId: str, progress: float, status: str):
"""Create standardized ChatLog entry.
@@ -122,4 +139,5 @@ class ProgressLogger:
def clearAllOperations(self):
"""Clear all active operations (for cleanup)."""
self.activeOperations.clear()
+ self.finishedOperations.clear()
logger.debug("Cleared all active operations")
diff --git a/modules/workflows/methods/methodAi.py b/modules/workflows/methods/methodAi.py
index c2250c59..60eab727 100644
--- a/modules/workflows/methods/methodAi.py
+++ b/modules/workflows/methods/methodAi.py
@@ -121,6 +121,15 @@ class MethodAi(MethodBase):
mimeType=d.get("mimeType") or output_mime_type
))
+ # Preserve structured content field for validation (if it exists)
+ # This allows validator to see the actual structured data, not just rendered output
+ if "content" in result and result["content"] and isinstance(result["content"], (dict, list)):
+ action_documents.append(ActionDocument(
+ documentName="structured_content.json",
+ documentData=result["content"],
+ mimeType="application/json"
+ ))
+
final_documents = action_documents
else:
extension = output_extension.lstrip('.')
@@ -222,74 +231,3 @@ class MethodAi(MethodBase):
except:
pass
return ActionResult.isFailure(error=str(e))
-
-
- @action
- async def generateImage(self, parameters: Dict[str, Any]) -> ActionResult:
- """
- GENERAL:
- - Purpose: Generate images using AI based on text prompts.
- - Input requirements: prompt (required); optional size, quality, style.
- - Output format: Base64 encoded image data.
-
- Parameters:
- - prompt (str, required): Text description of the image to generate.
- - size (str, optional): Image size. Options: 1024x1024, 1792x1024, 1024x1792. Default: 1024x1024.
- - quality (str, optional): Image quality. Options: standard, hd. Default: standard.
- - style (str, optional): Image style. Options: vivid, natural. Default: vivid.
- """
- try:
- prompt = parameters.get("prompt")
- if not prompt:
- return ActionResult.isFailure(error="Image prompt is required")
-
- # Extract optional parameters
- size = parameters.get("size", "1024x1024")
- quality = parameters.get("quality", "standard")
- style = parameters.get("style", "vivid")
-
- # Build AI call options for image generation
- options = AiCallOptions(
- operationType=OperationTypeEnum.IMAGE_GENERATE,
- resultFormat="base64"
- )
-
- # Create structured prompt using Pydantic model
- promptModel = AiCallPromptImage(
- prompt=prompt,
- size=size,
- quality=quality,
- style=style
- )
-
- # Convert to JSON string for prompt
- promptJson = promptModel.model_dump_json(exclude_none=True, indent=2)
-
- # Call AI service through unified path
- result = await self.services.ai.callAiDocuments(
- prompt=promptJson,
- documents=None,
- options=options,
- outputFormat="base64"
- )
-
- # Create meaningful filename
- meaningfulName = self._generateMeaningfulFileName(
- base_name="generated_image",
- extension="png",
- action_name="generate"
- )
-
- from modules.datamodels.datamodelChat import ActionDocument
- actionDocument = ActionDocument(
- documentName=meaningfulName,
- documentData=result,
- mimeType="image/png"
- )
-
- return ActionResult.isSuccess(documents=[actionDocument])
-
- except Exception as e:
- logger.error(f"Error in image generation: {str(e)}")
- return ActionResult.isFailure(error=str(e))
-
diff --git a/modules/workflows/processing/adaptive/VALIDATOR_ANALYSIS.md b/modules/workflows/processing/adaptive/VALIDATOR_ANALYSIS.md
new file mode 100644
index 00000000..4b4ff387
--- /dev/null
+++ b/modules/workflows/processing/adaptive/VALIDATOR_ANALYSIS.md
@@ -0,0 +1,266 @@
+# Content Validator - Deep Analysis & Target Design
+
+## CURRENT STATE ANALYSIS
+
+### How Validator Currently Works
+
+#### 1. **Document Input Flow**
+```
+ActionResult.documents (List[ActionDocument])
+ → modeReact.py extracts "structured content" with hardcoded checks
+ → Creates SimpleNamespace objects with wrapped documentData
+ → Passes to ContentValidator.validateContent()
+```
+
+#### 2. **Current Problems in modeReact.py (Lines 99-136)**
+- ❌ **Hardcoded document name checks**: `docName == "structured_content.json"`
+- ❌ **Hardcoded mimeType checks**: `mimeType == "application/json"`
+- ❌ **Hardcoded structure checks**: `'content' in docData or 'documents' in docData or 'sections' in docData`
+- ❌ **Single document selection**: `break` after first match - ignores other documents
+- ❌ **Non-generic logic**: Specific to certain document structures
+- ❌ **Workaround approach**: Trying to find structured content in various ways
+
+#### 3. **Current Problems in contentValidator.py**
+
+**`_extractContent()` method (Lines 21-41)**:
+- ❌ **Inconsistent handling**: Checks for `dict with 'content'` but then also handles raw `data`
+- ❌ **Silent failures**: Returns empty string on any exception
+- ❌ **Size limit hardcoded**: 10KB threshold is arbitrary
+- ❌ **No format awareness**: Doesn't check if document is binary/base64 before extracting
+- ❌ **No document type detection**: Doesn't distinguish text vs binary vs structured data
+
+**`_validateWithAI()` method (Lines 60-200)**:
+- ❌ **Forces all content to string**: `content[:2000]` truncation assumes text
+- ❌ **No document metadata passed**: Only name and content, no size, format, mimeType info
+- ❌ **No binary/base64 handling**: Will fail or show garbage for binary documents
+- ❌ **Multiple JSON extraction strategies**: Indicates unreliable AI response parsing
+- ❌ **Size limits inconsistent**: 10KB in extraction, 2KB in prompt - why different?
+
+#### 4. **Missing Capabilities**
+- ❌ No document size reporting to validator
+- ❌ No format validation (txt vs md vs pdf vs docx)
+- ❌ No binary data handling (images, PDFs, etc.)
+- ❌ No document count/summary statistics
+- ❌ No distinction between document types for validation
+
+---
+
+## TARGET DESIGN
+
+### Core Principles
+1. **GENERIC**: No hardcoded document names, types, or structures
+2. **DOCUMENT-AWARE**: Handle all document types (text, binary, base64, structured)
+3. **SIZE-CONSCIOUS**: Never pass full large documents to AI
+4. **METADATA-RICH**: Pass document metadata (name, size, format, mimeType) to validator
+5. **FORMAT-FLEXIBLE**: Allow format flexibility (md ≈ text, but pdf ≠ docx)
+
+### Target Architecture
+
+```
+Documents Input (List[ActionDocument])
+ ↓
+Document Analyzer (generic)
+ - Extract metadata (name, size, mimeType, format)
+ - Determine content type (text/binary/base64/structured)
+ - Create preview/summary for large documents
+ ↓
+Document Summary (for AI validation)
+ - Metadata only for binary/base64
+ - Preview/sample for large text documents
+ - Full content for small text/structured documents
+ ↓
+Validation Prompt Builder (generic)
+ - Include document summaries (not full content)
+ - Include document metadata
+ - Include format validation rules (generic)
+ ↓
+AI Validator
+ - Validates against task objective (generic)
+ - Validates format compliance (flexible)
+ - Validates document count/size
+```
+
+---
+
+## REQUIRED CHANGES
+
+### 1. **Remove All Hardcoded Checks from modeReact.py**
+- ❌ Remove document name checks
+- ❌ Remove mimeType-specific logic
+- ❌ Remove structure-specific checks
+- ✅ Pass ALL documents to validator (let validator decide what to validate)
+- ✅ Keep it simple: `validationDocs = result.documents`
+
+### 2. **Redesign contentValidator.py - New Structure**
+
+#### New Method: `_analyzeDocuments(documents)`
+```python
+def _analyzeDocuments(self, documents: List[Any]) -> List[Dict[str, Any]]:
+ """
+ Generic document analysis - extract metadata and create summaries.
+ Returns list of document summaries ready for validation prompt.
+ """
+ summaries = []
+ for doc in documents:
+ summary = {
+ "name": getattr(doc, 'documentName', 'Unknown'),
+ "mimeType": getattr(doc, 'mimeType', 'unknown'),
+ "format": self._detectFormat(doc),
+ "size": self._calculateSize(doc),
+ "type": self._detectContentType(doc), # text/binary/base64/structured
+ "preview": self._createPreview(doc), # None for binary, sample for large text
+ "isAccessible": self._isContentAccessible(doc) # Can we read content?
+ }
+ summaries.append(summary)
+ return summaries
+```
+
+#### New Method: `_detectFormat(doc)`
+- Extract from filename extension or mimeType
+- Generic mapping: `text/plain` → `txt`, `text/markdown` → `md`, etc.
+- Return format string (txt, md, pdf, docx, json, etc.)
+
+#### New Method: `_calculateSize(doc)`
+- Calculate document size in bytes
+- Handle string, dict, list, bytes, base64
+- Return: `{"bytes": int, "readable": "1.5 MB"}`
+
+#### New Method: `_detectContentType(doc)`
+- `text`: Readable text content
+- `structured`: JSON/dict/list structures
+- `binary`: Binary data (PDF, images, etc.)
+- `base64`: Base64-encoded data
+- Return content type string
+
+#### New Method: `_createPreview(doc)`
+- **Binary/Base64**: Return `None` (metadata only)
+- **Large text (>50KB)**: Return first 1KB + size indicator
+- **Small text (≤50KB)**: Return full content
+- **Structured data**: Return JSON string (truncated if large)
+
+#### New Method: `_isContentAccessible(doc)`
+- Check if document content can be extracted for validation
+- Binary/base64 documents: `False` (validate by metadata only)
+- Text/structured documents: `True`
+
+### 3. **Redesign Validation Prompt (Generic)**
+
+```python
+validationPrompt = f"""TASK VALIDATION
+
+USER REQUEST: '{intent.get('primaryGoal', 'Unknown')}'
+EXPECTED DATA TYPE: {intent.get('dataType', 'unknown')}
+EXPECTED FORMAT: {intent.get('expectedFormat', 'unknown')}
+SUCCESS CRITERIA ({criteriaCount} items): {successCriteria}
+
+DELIVERED DOCUMENTS ({len(documentSummaries)} items):
+{json.dumps(documentSummaries, indent=2)}
+
+VALIDATION RULES:
+1. Check if delivered documents match expected data type
+2. Check if delivered formats are compatible with expected format
+ (Note: text formats like txt/md are compatible; pdf ≠ docx but both are documents)
+3. Verify each success criterion is met based on document content/metadata
+4. Check document sizes are reasonable for the task
+5. Rate overall quality (0.0-1.0)
+6. Identify specific gaps
+7. Suggest next steps
+
+OUTPUT FORMAT - JSON ONLY (no prose):
+{{
+ "overallSuccess": false,
+ "qualityScore": 0.0,
+ "dataTypeMatch": false,
+ "formatMatch": false,
+ "documentCount": {len(documentSummaries)},
+  "successCriteriaMet": {json.dumps([False] * criteriaCount)},
+ "gapAnalysis": "Specific gaps found",
+ "improvementSuggestions": ["NEXT STEP: Action 1"],
+ "validationDetails": [
+ {{
+ "documentName": "document.ext",
+ "issues": ["Issue 1"],
+ "suggestions": ["NEXT STEP: Fix 1"]
+ }}
+ ]
+}}
+"""
+```
+
+### 4. **Format Validation Logic (Generic & Flexible)**
+
+```python
+def _isFormatCompatible(self, deliveredFormat: str, expectedFormat: str) -> bool:
+ """
+ Generic format compatibility check.
+ - txt/md/html are text formats (compatible with each other)
+ - pdf/docx/xlsx are document formats (not compatible with each other)
+ - json/xml are structured formats
+ - images are image formats
+ """
+ # Text formats are interchangeable
+ textFormats = ['txt', 'md', 'html', 'text', 'plain']
+ if deliveredFormat.lower() in textFormats and expectedFormat.lower() in textFormats:
+ return True
+
+ # Exact match
+ if deliveredFormat.lower() == expectedFormat.lower():
+ return True
+
+ # Structured formats
+ if deliveredFormat.lower() in ['json', 'xml'] and expectedFormat.lower() in ['json', 'xml']:
+ return True # Could be made more flexible
+
+ return False
+```
+
+---
+
+## IMPLEMENTATION PLAN
+
+### Phase 1: Clean Up modeReact.py
+- Remove all hardcoded checks
+- Simply pass `result.documents` to validator
+
+### Phase 2: Redesign Document Analysis
+- Implement `_analyzeDocuments()`
+- Implement helper methods: `_detectFormat()`, `_calculateSize()`, `_detectContentType()`, `_createPreview()`
+
+### Phase 3: Redesign Validation Prompt
+- Generic prompt with document summaries
+- Include metadata, not full content
+- Size-aware handling
+
+### Phase 4: Implement Format Validation
+- Generic format compatibility logic
+- Flexible matching (text formats, document formats, etc.)
+
+### Phase 5: Testing
+- Test with text documents (small & large)
+- Test with binary documents (PDF, images)
+- Test with base64 documents
+- Test with structured data (JSON)
+
+---
+
+## KEY DESIGN DECISIONS
+
+1. **Pass ALL documents**: Validator decides what to validate, not the caller
+2. **Metadata over content**: For large/binary documents, pass metadata only
+3. **Preview samples**: For large text documents, pass preview + size info
+4. **Generic prompts**: No task-specific or format-specific logic
+5. **Flexible format matching**: Text formats compatible, document formats strict
+6. **Size limits**: 50KB threshold for full content (configurable)
+7. **Content type detection**: Explicit type detection (text/binary/base64/structured)
+
+---
+
+## BENEFITS OF TARGET DESIGN
+
+✅ **Generic**: Works with any document type without hardcoding
+✅ **Scalable**: Handles large documents without issues
+✅ **Flexible**: Format validation is flexible where appropriate
+✅ **Maintainable**: Clear separation of concerns
+✅ **Robust**: Handles edge cases (binary, base64, large files)
+✅ **Testable**: Each component can be tested independently
+
diff --git a/modules/workflows/processing/adaptive/contentValidator.py b/modules/workflows/processing/adaptive/contentValidator.py
index 6ac81df8..d13e0dfc 100644
--- a/modules/workflows/processing/adaptive/contentValidator.py
+++ b/modules/workflows/processing/adaptive/contentValidator.py
@@ -1,14 +1,22 @@
# contentValidator.py
# Content validation for adaptive React mode
+# Generic, document-aware validation system
import logging
import json
-from typing import List, Dict, Any
+import base64
+import re
+from typing import List, Dict, Any, Optional
logger = logging.getLogger(__name__)
+# Configuration constants
+MAX_CONTENT_SIZE_FOR_FULL_PREVIEW = 50 * 1024 # 50KB threshold
+PREVIEW_SAMPLE_SIZE = 1024 # 1KB preview for large documents
+
+
class ContentValidator:
- """Validates delivered content against user intent"""
+ """Validates delivered content against user intent - generic and document-aware"""
def __init__(self, services=None, learningEngine=None):
self.services = services
@@ -18,78 +26,277 @@ class ContentValidator:
"""Validates delivered content against user intent using AI (single attempt; parse-or-fail)"""
return await self._validateWithAI(documents, intent)
- def _extractContent(self, doc: Any) -> str:
- """Extracts content from a document with size protection for large documents"""
+ def _analyzeDocuments(self, documents: List[Any]) -> List[Dict[str, Any]]:
+ """Generic document analysis - create simple summaries with metadata."""
+ summaries = []
+ for doc in documents:
+ try:
+ data = getattr(doc, 'documentData', None)
+ name = getattr(doc, 'documentName', 'Unknown')
+ mimeType = getattr(doc, 'mimeType', 'unknown')
+ formatExt = self._detectFormat(doc)
+ sizeInfo = self._calculateSize(doc)
+
+ # Simple preview: if it's dict/list, dump JSON; otherwise use string
+ preview = None
+ if data is not None:
+ if isinstance(data, (dict, list)):
+ preview = json.dumps(data, indent=2, ensure_ascii=False)
+ # Truncate if too large
+ if len(preview) > MAX_CONTENT_SIZE_FOR_FULL_PREVIEW:
+ preview = preview[:PREVIEW_SAMPLE_SIZE] + f"\n\n[Truncated - {self._formatBytes(sizeInfo['bytes'])} total]"
+ else:
+ text = str(data)
+ if len(text) > MAX_CONTENT_SIZE_FOR_FULL_PREVIEW:
+ preview = text[:PREVIEW_SAMPLE_SIZE] + f"\n\n[Truncated - {self._formatBytes(sizeInfo['bytes'])} total]"
+ else:
+ preview = text
+
+ summary = {
+ "name": name,
+ "mimeType": mimeType,
+ "format": formatExt,
+ "size": sizeInfo["readable"],
+ "preview": preview
+ }
+ summaries.append(summary)
+ except Exception as e:
+ logger.warning(f"Error analyzing document {getattr(doc, 'documentName', 'Unknown')}: {str(e)}")
+ summaries.append({
+ "name": getattr(doc, 'documentName', 'Unknown'),
+ "mimeType": getattr(doc, 'mimeType', 'unknown'),
+ "format": "unknown",
+ "size": "0 B",
+ "preview": None,
+ "error": str(e)
+ })
+ return summaries
+
+ def _calculateAvailablePromptSpace(self, basePromptSizeBytes: int) -> int:
+ """Calculate available space for document summaries based on model context length."""
try:
- if hasattr(doc, 'documentData'):
- data = doc.documentData
- if isinstance(data, dict) and 'content' in data:
- content = data['content']
- # For large content, check size before converting to string
- if hasattr(content, '__len__') and len(str(content)) > 10000: # 10KB threshold
- # For very large content, return a size indicator instead
- return f"[Large document content - {len(str(content))} characters - truncated for validation]"
- return str(content)
- else:
- content = data
- # For large content, check size before converting to string
- if hasattr(content, '__len__') and len(str(content)) > 10000: # 10KB threshold
- return f"[Large document content - {len(str(content))} characters - truncated for validation]"
- return str(content)
- return ""
- except Exception:
- return ""
+ from modules.aicore.aicoreModelRegistry import modelRegistry
+ from modules.aicore.aicoreModelSelector import modelSelector
+ from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
+
+ # Get available models
+ availableModels = modelRegistry.getAvailableModels()
+
+ # Create options for PLAN operation (what validation uses)
+ options = AiCallOptions(
+ operationType=OperationTypeEnum.PLAN,
+ priority=None,
+ processingMode=None
+ )
+
+ # Get failover model list to find the model that will be used
+ failoverModels = modelSelector.getFailoverModelList("", "", options, availableModels)
+
+ if not failoverModels:
+ # Fallback: assume 16K tokens context (conservative)
+ logger.warning("No models available for space calculation, using fallback: 16K tokens")
+                maxBytes = 16 * 1024 * 4  # 16K tokens * ~4 chars/token (≈1 byte each for ASCII text)
+ else:
+ # Use the first (best) model
+ model = failoverModels[0]
+                # Calculate 80% of context length in bytes (≈4 chars per token, ~1 byte/char assumed)
+ maxBytes = int(model.contextLength * 0.8 * 4)
+
+ # Available space = max - base prompt - safety margin (10%)
+ availableBytes = int((maxBytes - basePromptSizeBytes) * 0.9)
+
+ # Ensure minimum available space (at least 1KB)
+ availableBytes = max(availableBytes, 1024)
+
+ logger.debug(f"Prompt space calculation: base={basePromptSizeBytes} bytes, max={maxBytes} bytes, available={availableBytes} bytes")
+
+ return availableBytes
+
+ except Exception as e:
+ logger.warning(f"Error calculating available prompt space: {str(e)}, using fallback: 8KB")
+ # Fallback: assume 8KB available
+ return 8 * 1024
- # Removed schema fallback creator to keep failures explicit
+ def _analyzeDocumentsWithSizeLimit(self, documents: List[Any], maxTotalBytes: int) -> List[Dict[str, Any]]:
+ """Analyze documents with size limit, dividing available space evenly among documents."""
+ if not documents:
+ return []
+
+ # Reserve space for JSON structure overhead (approximately 200 bytes per document)
+ jsonOverheadPerDoc = 200
+ reservedOverhead = len(documents) * jsonOverheadPerDoc
+ availableForContent = max(0, maxTotalBytes - reservedOverhead)
+
+ # Divide available space evenly among documents
+ bytesPerDoc = availableForContent // len(documents) if documents else 0
+ # Ensure minimum space per document (at least 100 bytes)
+ bytesPerDoc = max(bytesPerDoc, 100)
+
+ logger.debug(f"Document summary space: total={maxTotalBytes} bytes, available={availableForContent} bytes, perDoc={bytesPerDoc} bytes")
+
+ summaries = []
+ for doc in documents:
+ try:
+ data = getattr(doc, 'documentData', None)
+ name = getattr(doc, 'documentName', 'Unknown')
+ mimeType = getattr(doc, 'mimeType', 'unknown')
+ formatExt = self._detectFormat(doc)
+ sizeInfo = self._calculateSize(doc)
+
+ # Create preview with size limit
+ preview = None
+ if data is not None:
+ if isinstance(data, (dict, list)):
+ preview = json.dumps(data, indent=2, ensure_ascii=False)
+ else:
+ preview = str(data)
+
+ # Truncate preview to fit within bytesPerDoc (accounting for JSON structure)
+ # Estimate: preview takes ~70% of document summary space
+ maxPreviewBytes = int(bytesPerDoc * 0.7)
+ previewBytes = len(preview.encode('utf-8'))
+
+ if previewBytes > maxPreviewBytes:
+ # Truncate to fit
+ truncated = preview.encode('utf-8')[:maxPreviewBytes]
+ # Try to decode safely
+ try:
+ preview = truncated.decode('utf-8', errors='ignore')
+ except:
+ preview = truncated[:maxPreviewBytes-50].decode('utf-8', errors='ignore')
+ preview += f"\n\n[Truncated - {self._formatBytes(sizeInfo['bytes'])} total]"
+
+ summary = {
+ "name": name,
+ "mimeType": mimeType,
+ "format": formatExt,
+ "size": sizeInfo["readable"],
+ "preview": preview
+ }
+ summaries.append(summary)
+ except Exception as e:
+ logger.warning(f"Error analyzing document {getattr(doc, 'documentName', 'Unknown')}: {str(e)}")
+ summaries.append({
+ "name": getattr(doc, 'documentName', 'Unknown'),
+ "mimeType": getattr(doc, 'mimeType', 'unknown'),
+ "format": "unknown",
+ "size": "0 B",
+ "preview": None,
+ "error": str(e)
+ })
+
+ return summaries
- def _isValidJsonResponse(self, response: str) -> bool:
- """Checks if response contains valid JSON structure"""
+ def _detectFormat(self, doc: Any) -> str:
+ """Extract format from filename extension (always use extension)"""
try:
- import re
- # Look for JSON with expected structure
- json_match = re.search(r'\{[^{}]*"overallSuccess"[^{}]*\}', response, re.DOTALL)
- if json_match:
- json.loads(json_match.group(0))
- return True
- return False
- except:
- return False
+ docName = getattr(doc, 'documentName', '')
+
+ # Extract from filename extension
+ if docName and '.' in docName:
+ ext = docName.rsplit('.', 1)[1].lower()
+ return ext
+
+ return 'unknown'
+ except Exception as e:
+ logger.warning(f"Error detecting format: {str(e)}")
+ return 'unknown'
- # Removed text-based fallback extraction to avoid hiding issues
+ def _calculateSize(self, doc: Any) -> Dict[str, Any]:
+ """Calculate document size in bytes and human-readable format"""
+ try:
+ if not hasattr(doc, 'documentData') or doc.documentData is None:
+ return {"bytes": 0, "readable": "0 B"}
+
+ data = doc.documentData
+ size_bytes = 0
+
+ if isinstance(data, str):
+ size_bytes = len(data.encode('utf-8'))
+ elif isinstance(data, bytes):
+ size_bytes = len(data)
+ elif isinstance(data, (dict, list)):
+ # Estimate JSON size
+ try:
+ json_str = json.dumps(data)
+ size_bytes = len(json_str.encode('utf-8'))
+ except:
+ size_bytes = len(str(data).encode('utf-8'))
+ else:
+ size_bytes = len(str(data).encode('utf-8'))
+
+ # Convert to human-readable format
+ readable = self._formatBytes(size_bytes)
+
+ return {"bytes": size_bytes, "readable": readable}
+ except Exception as e:
+ logger.warning(f"Error calculating size: {str(e)}")
+ return {"bytes": 0, "readable": "0 B"}
+
+ def _formatBytes(self, bytes: int) -> str:
+ """Format bytes to human-readable string"""
+ for unit in ['B', 'KB', 'MB', 'GB']:
+ if bytes < 1024.0:
+ return f"{bytes:.1f} {unit}"
+ bytes /= 1024.0
+ return f"{bytes:.1f} TB"
+
+
+ def _isFormatCompatible(self, deliveredFormat: str, expectedFormat: str) -> bool:
+ """
+ Generic format compatibility check.
+ - txt/md/html are text formats (compatible with each other)
+ - pdf/docx/xlsx are document formats (not compatible with each other)
+ - json/xml are structured formats
+ - images are image formats
+ """
+ deliveredLower = deliveredFormat.lower()
+ expectedLower = expectedFormat.lower()
+
+ # Exact match
+ if deliveredLower == expectedLower:
+ return True
+
+ # Text formats are interchangeable
+ textFormats = ['txt', 'md', 'html', 'text', 'plain']
+ if deliveredLower in textFormats and expectedLower in textFormats:
+ return True
+
+ # Structured formats
+ if deliveredLower in ['json', 'xml'] and expectedLower in ['json', 'xml']:
+ return True
+
+ # Document formats are NOT compatible with each other
+ documentFormats = ['pdf', 'docx', 'xlsx', 'pptx']
+ if deliveredLower in documentFormats and expectedLower in documentFormats:
+ return False # pdf ≠ docx
+
+ return False
async def _validateWithAI(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
- """AI-based comprehensive validation - single main function"""
+ """AI-based comprehensive validation - generic approach"""
try:
if not hasattr(self, 'services') or not self.services or not hasattr(self.services, 'ai'):
return self._createFailedValidationResult("AI service not available")
- # Extract content from all documents
- documentContents = []
- for doc in documents:
- content = self._extractContent(doc)
- documentContents.append({
- "name": getattr(doc, 'documentName', 'Unknown'),
- "content": content[:2000] # Limit content for AI processing
- })
-
- # Create structured AI validation prompt
+ # Build prompt base WITHOUT document summaries first
successCriteria = intent.get('successCriteria', [])
criteriaCount = len(successCriteria)
- validationPrompt = f"""TASK VALIDATION
+ promptBase = f"""TASK VALIDATION
USER REQUEST: '{intent.get('primaryGoal', 'Unknown')}'
-EXPECTED TYPE: {intent.get('dataType', 'unknown')}
+EXPECTED DATA TYPE: {intent.get('dataType', 'unknown')}
EXPECTED FORMAT: {intent.get('expectedFormat', 'unknown')}
SUCCESS CRITERIA ({criteriaCount} items): {successCriteria}
VALIDATION RULES:
-1. Check if content matches expected data type
-2. Check if content matches expected format
-3. Verify each success criterion is met
-4. Rate overall quality (0.0-1.0)
-5. Identify specific gaps
-6. Suggest next steps
+1. Check if delivered documents match expected data type
+2. Check if delivered formats are compatible with expected format
+3. Verify each success criterion is met based on document content/metadata
+4. Check document sizes are reasonable for the task
+5. Rate overall quality (0.0-1.0)
+6. Identify specific gaps based on what the user requested
OUTPUT FORMAT - JSON ONLY (no prose):
{{
@@ -97,31 +304,45 @@ OUTPUT FORMAT - JSON ONLY (no prose):
"qualityScore": 0.0,
"dataTypeMatch": false,
"formatMatch": false,
+ "documentCount": {len(documents)},
"successCriteriaMet": {[False] * criteriaCount},
- "gapAnalysis": "Specific gaps found",
- "improvementSuggestions": ["NEXT STEP: Action 1", "NEXT STEP: Action 2"],
+ "gapAnalysis": "Describe what is missing or incorrect",
+ "improvementSuggestions": ["General action to improve overall result"],
"validationDetails": [
{{
- "documentName": "Document Name",
- "issues": ["Issue 1", "Issue 2"],
- "suggestions": ["NEXT STEP: Fix 1", "NEXT STEP: Fix 2"]
+ "documentName": "document.ext",
+ "issues": ["Specific problem with this document"],
+ "suggestions": ["Specific fix for this document's issues"]
}}
]
}}
-DELIVERED CONTENT TO CHECK:
-{json.dumps(documentContents, indent=2)}
+Field explanations:
+- "improvementSuggestions": Overall actions to improve the entire result (general, high-level)
+- "validationDetails[].suggestions": Specific fixes for each document's individual issues (document-specific, detailed)
+- Do NOT use prefixes like "NEXT STEP:" - describe actions directly
+DELIVERED DOCUMENTS ({len(documents)} items):
"""
+ # Calculate available space for document summaries
+ # Get the model that will be used for validation
+ basePromptSize = len(promptBase.encode('utf-8'))
+ availableBytes = self._calculateAvailablePromptSpace(basePromptSize)
+
+ # Analyze documents with size constraints
+ documentSummaries = self._analyzeDocumentsWithSizeLimit(documents, availableBytes)
+
+ # Build final prompt with summaries at the end
+ documentsJson = json.dumps(documentSummaries, indent=2)
+ validationPrompt = promptBase + documentsJson
+
# Call AI service for validation
response = await self.services.ai.callAiPlanning(
prompt=validationPrompt,
placeholders=None
)
- # No retries or correction prompts here; parse-or-fail below
-
if not response or not response.strip():
logger.warning("AI validation returned empty response")
raise ValueError("AI validation failed - empty response")
@@ -131,8 +352,6 @@ DELIVERED CONTENT TO CHECK:
logger.debug(f"AI validation response length: {len(result)}")
# Try to find JSON in the response with multiple strategies
- import re
-
# Strategy 1: Look for JSON in markdown code blocks
json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', result, re.DOTALL)
if json_match:
@@ -168,6 +387,7 @@ DELIVERED CONTENT TO CHECK:
normalized = {
"overallSuccess": overall if isinstance(overall, bool) else None,
"qualityScore": float(quality) if isinstance(quality, (int, float)) else None,
+ "documentCount": len(documentSummaries),
"validationDetails": details if isinstance(details, list) else [{
"documentName": "AI Validation",
"gapAnalysis": gap,
@@ -197,4 +417,22 @@ DELIVERED CONTENT TO CHECK:
except Exception as e:
logger.error(f"AI validation failed: {str(e)}")
- raise
\ No newline at end of file
+ raise
+
+ def _createFailedValidationResult(self, errorMessage: str) -> Dict[str, Any]:
+ """Create a standardized failed validation result"""
+ return {
+ "overallSuccess": False,
+ "qualityScore": 0.0,
+ "dataTypeMatch": False,
+ "formatMatch": False,
+ "documentCount": 0,
+ "successCriteriaMet": [],
+ "gapAnalysis": errorMessage,
+ "improvementSuggestions": [],
+ "validationDetails": [],
+ "schemaCompliant": True,
+ "originalType": "error",
+ "missingFields": [],
+ "error": errorMessage
+ }
diff --git a/modules/workflows/processing/modes/modeReact.py b/modules/workflows/processing/modes/modeReact.py
index 6aa6505a..3b8d9910 100644
--- a/modules/workflows/processing/modes/modeReact.py
+++ b/modules/workflows/processing/modes/modeReact.py
@@ -96,14 +96,10 @@ class ReactMode(BaseMode):
# Attach deterministic label for clarity
observation['resultLabel'] = result.resultLabel
- # NEW: Add content validation (against original cleaned user prompt / workflow intent)
+ # Content validation (against original cleaned user prompt / workflow intent)
if getattr(self, 'workflowIntent', None) and result.documents:
- # Validate ONLY the produced JSON (structured content), not rendered files
- from types import SimpleNamespace
- validationDocs = []
- if hasattr(result, 'content') and result.content:
- validationDocs.append(SimpleNamespace(documentName='generated.json', documentData={'content': result.content}))
- validationResult = await self.contentValidator.validateContent(validationDocs, self.workflowIntent)
+ # Pass ALL documents to validator - validator decides what to validate (generic approach)
+ validationResult = await self.contentValidator.validateContent(result.documents, self.workflowIntent)
observation['contentValidation'] = validationResult
quality_score = validationResult.get('qualityScore', 0.0)
if quality_score is None:
diff --git a/naming_violations_report.csv b/naming_violations_report.csv
deleted file mode 100644
index a80259f6..00000000
--- a/naming_violations_report.csv
+++ /dev/null
@@ -1,107 +0,0 @@
-Module,Function Names,Parameter Names,Variable Names,Total
-modules/workflows/methods/methodSharepoint.py,0,2,211,213
-modules/workflows/methods/methodOutlook.py,0,3,131,134
-modules/services/serviceAi/subDocumentProcessing.py,0,0,104,104
-modules/features/syncDelta/mainSyncDelta.py,1,10,88,99
-modules/shared/jsonUtils.py,0,3,88,91
-modules/services/serviceGeneration/renderers/rendererDocx.py,3,8,79,90
-modules/services/serviceWorkflow/mainServiceWorkflow.py,0,3,85,88
-modules/services/serviceGeneration/renderers/rendererPptx.py,2,7,73,82
-modules/services/serviceGeneration/renderers/rendererPdf.py,3,8,50,61
-modules/connectors/connectorVoiceGoogle.py,1,2,52,55
-modules/services/serviceGeneration/renderers/rendererHtml.py,3,6,46,55
-modules/services/serviceGeneration/renderers/rendererBaseTemplate.py,3,21,27,51
-modules/shared/configuration.py,2,17,30,49
-modules/services/serviceExtraction/subMerger.py,2,5,31,38
-modules/connectors/connectorDbPostgre.py,0,14,20,34
-modules/interfaces/interfaceDbAppObjects.py,0,8,26,34
-modules/routes/routeSecurityGoogle.py,0,0,32,32
-modules/shared/attributeUtils.py,3,4,25,32
-modules/interfaces/interfaceDbChatObjects.py,0,4,27,31
-modules/routes/routeSecurityAdmin.py,0,2,28,30
-modules/services/serviceNeutralization/subProcessList.py,7,0,22,29
-modules/services/serviceGeneration/renderers/rendererText.py,3,7,19,29
-modules/routes/routeSecurityMsft.py,0,0,27,27
-modules/services/serviceGeneration/renderers/rendererMarkdown.py,3,7,17,27
-modules/services/serviceGeneration/renderers/rendererXlsx.py,3,0,24,27
-modules/services/serviceGeneration/renderers/rendererImage.py,3,2,21,26
-modules/security/tokenManager.py,4,7,14,25
-modules/workflows/workflowManager.py,0,0,25,25
-modules/services/serviceGeneration/renderers/rendererCsv.py,3,5,17,25
-modules/shared/auditLogger.py,5,16,3,24
-modules/shared/debugLogger.py,0,0,24,24
-modules/workflows/processing/shared/placeholderFactory.py,0,0,24,24
-modules/interfaces/interfaceDbAppAccess.py,0,2,21,23
-modules/connectors/connectorTicketsJira.py,0,0,22,22
-modules/services/serviceGeneration/renderers/registry.py,7,3,12,22
-modules/routes/routeDataConnections.py,1,1,19,21
-modules/security/tokenRefreshService.py,0,2,19,21
-modules/services/serviceExtraction/extractors/extractorPptx.py,0,1,16,17
-modules/routes/routeSecurityLocal.py,0,0,16,16
-modules/workflows/methods/methodBase.py,0,4,12,16
-modules/services/serviceGeneration/mainServiceGeneration.py,0,4,11,15
-modules/services/serviceUtils/mainServiceUtils.py,0,14,1,15
-modules/features/neutralizePlayground/mainNeutralizePlayground.py,8,5,2,15
-modules/interfaces/interfaceTicketObjects.py,0,5,9,14
-modules/services/serviceNeutralization/subParseString.py,7,0,6,13
-modules/workflows/processing/modes/modeReact.py,0,1,11,12
-modules/interfaces/interfaceDbComponentAccess.py,0,2,9,11
-modules/services/serviceAi/subCoreAi.py,0,0,11,11
-modules/services/serviceExtraction/subRegistry.py,0,0,11,11
-modules/services/serviceNeutralization/mainServiceNeutralization.py,0,2,9,11
-modules/interfaces/interfaceAiObjects.py,0,0,10,10
-modules/services/serviceAi/subSharedAiUtils.py,0,3,7,10
-modules/connectors/connectorDbJson.py,0,3,6,9
-modules/workflows/methods/methodAi.py,0,0,9,9
-modules/services/serviceExtraction/subPromptBuilderExtraction.py,0,0,9,9
-modules/services/serviceGeneration/subDocumentUtility.py,0,3,6,9
-modules/services/serviceNeutralization/subProcessCommon.py,7,2,0,9
-modules/services/serviceNeutralization/subProcessText.py,5,0,4,9
-modules/interfaces/interfaceDbChatAccess.py,0,2,6,8
-modules/security/auth.py,0,1,7,8
-modules/aicore/aicorePluginAnthropic.py,0,0,7,7
-modules/security/tokenRefreshMiddleware.py,0,2,4,6
-modules/services/serviceGeneration/renderers/rendererJson.py,3,0,3,6
-analyze_naming_violations.py,5,0,0,5
-modules/aicore/aicorePluginOpenai.py,0,0,5,5
-modules/routes/routeVoiceGoogle.py,0,0,5,5
-modules/shared/eventManagement.py,2,3,0,5
-modules/workflows/processing/adaptive/intentAnalyzer.py,0,0,5,5
-modules/workflows/processing/shared/executionState.py,0,5,0,5
-modules/services/serviceGeneration/subJsonSchema.py,0,0,5,5
-modules/services/serviceNeutralization/subPatterns.py,5,0,0,5
-modules/services/serviceNeutralization/subProcessBinary.py,4,0,1,5
-modules/services/serviceExtraction/extractors/extractorXlsx.py,0,0,5,5
-modules/interfaces/interfaceDbComponentObjects.py,0,3,1,4
-modules/routes/routeDataNeutralization.py,0,0,4,4
-modules/routes/routeWorkflows.py,0,0,4,4
-modules/shared/timezoneUtils.py,3,1,0,4
-modules/workflows/processing/adaptive/contentValidator.py,0,0,4,4
-modules/workflows/processing/core/messageCreator.py,0,0,4,4
-modules/services/serviceSharepoint/mainServiceSharepoint.py,0,0,4,4
-modules/routes/routeDataUsers.py,0,0,3,3
-modules/services/serviceExtraction/subPipeline.py,0,0,3,3
-app.py,0,0,2,2
-modules/datamodels/datamodelChat.py,0,1,1,2
-modules/routes/routeAttributes.py,0,0,2,2
-modules/routes/routeDataPrompts.py,0,0,2,2
-modules/security/csrf.py,0,1,1,2
-modules/security/jwtService.py,0,0,2,2
-modules/workflows/processing/adaptive/learningEngine.py,0,0,2,2
-modules/workflows/processing/modes/modeActionplan.py,0,0,2,2
-modules/workflows/processing/shared/methodDiscovery.py,0,0,2,2
-modules/services/serviceNormalization/mainServiceNormalization.py,0,0,2,2
-modules/services/serviceExtraction/extractors/extractorImage.py,0,0,2,2
-modules/aicore/aicoreBase.py,0,0,1,1
-modules/aicore/aicoreModelSelector.py,0,0,1,1
-modules/connectors/connectorTicketsClickup.py,0,0,1,1
-modules/datamodels/datamodelDocument.py,0,1,0,1
-modules/datamodels/datamodelSecurity.py,0,0,1,1
-modules/routes/routeAdmin.py,0,0,1,1
-modules/routes/routeDataFiles.py,0,0,1,1
-modules/workflows/processing/workflowProcessor.py,0,0,1,1
-modules/workflows/processing/adaptive/adaptiveLearningEngine.py,0,0,1,1
-modules/workflows/processing/core/actionExecutor.py,0,0,1,1
-modules/workflows/processing/core/taskPlanner.py,0,0,1,1
-modules/workflows/processing/modes/modeBase.py,0,0,1,1
-modules/services/serviceAi/subDocumentGeneration.py,0,0,1,1
diff --git a/processDocumentsWithContinuation_usage_analysis.md b/processDocumentsWithContinuation_usage_analysis.md
deleted file mode 100644
index 3dedc405..00000000
--- a/processDocumentsWithContinuation_usage_analysis.md
+++ /dev/null
@@ -1,184 +0,0 @@
-# Analysis: `processDocumentsWithContinuation` and Subfunctions Usage
-
-## Executive Summary
-
-**FINDING**: The function `processDocumentsWithContinuation` in `subDocumentProcessing.py` is **NOT USED** anywhere in the active codebase. The continuation chain was only referenced by the deleted `subDocumentGeneration.py` module.
-
----
-
-## Main Function: `processDocumentsWithContinuation`
-
-**Location**: `gateway/modules/services/serviceAi/subDocumentProcessing.py:303`
-
-**Status**: ❌ **NOT USED**
-
-### Usage Search Results
-
-- ❌ No actual code calls to `.processDocumentsWithContinuation(`
-- ⚠️ Only mentioned in documentation files:
- - `wiki/poweron/appdoc/doc_system_function_relationship_ai.md` (documentation)
- - `gateway/callAiWithDocumentGeneration_usage_analysis.md` (previous analysis - noted it was called by deleted code)
-
-### Why It's Not Used
-
-The only caller was `subDocumentGeneration._processDocumentsUnified()` which we just deleted. The current active codebase uses `subCoreAi.callAiDocuments()` which has its own continuation logic via `_callAiWithLooping()`.
-
----
-
-## Function Call Chain Analysis
-
-```
-processDocumentsWithContinuation (line 303) - ❌ NOT USED
- ├─> _buildContinuationPrompt (line 319, 324) - ❌ ONLY USED HERE
- └─> _processWithContinuationLoop (line 322, 373) - ❌ ONLY USED HERE
- ├─> _buildContinuationIterationPrompt (line 393, 459) - ❌ ONLY USED HERE
- └─> processDocumentsPerChunkJsonWithPrompt (line 402) - ✅ USED ELSEWHERE
-```
-
----
-
-## Subfunction Analysis
-
-### 1. `_buildContinuationPrompt`
-**Location**: Line 324-371
-**Status**: ✅ **USED** (but only internally)
-**Called by**: `processDocumentsWithContinuation` (line 319)
-**Effectively**: ❌ **UNUSED** (because parent function is unused)
-
-**Internal Usage**:
-- Called from `processDocumentsWithContinuation` at line 319
-
-**Functionality**:
-- Builds a prompt with continuation instructions
-- Adds JSON structure requirements with `"continue": true/false` flag
-- Adds `continuation_context` field specification
-
-**Note**: This uses a different continuation pattern than `SubCoreAi._callAiWithLooping()`:
-- This uses `"continue": true/false + "continuation_context"` for document sections
-- SubCoreAi uses `buildContinuationContext()` with `last_raw_json`
-
----
-
-### 2. `_processWithContinuationLoop`
-**Location**: Line 373-457
-**Status**: ✅ **USED** (but only internally)
-**Called by**: `processDocumentsWithContinuation` (line 322)
-**Effectively**: ❌ **UNUSED** (because parent function is unused)
-
-**Internal Usage**:
-- Called from `processDocumentsWithContinuation` at line 322
-
-**External Dependencies**:
-- Calls `self._buildContinuationIterationPrompt()` (line 393)
-- Calls `self.processDocumentsPerChunkJsonWithPrompt()` (line 402)
-
-**Functionality**:
-- Implements continuation loop (max 10 iterations)
-- Accumulates sections across iterations
-- Checks `continue` flag and `continuation_context` to determine if more iterations needed
-- Builds final result with accumulated sections
-
----
-
-### 3. `_buildContinuationIterationPrompt`
-**Location**: Line 459-498
-**Status**: ✅ **USED** (but only internally)
-**Called by**: `_processWithContinuationLoop` (line 393)
-**Effectively**: ❌ **UNUSED** (because parent chain is unused)
-
-**Internal Usage**:
-- Called from `_processWithContinuationLoop` at line 393 (in loop, conditionally)
-
-**Functionality**:
-- Builds a prompt for continuation iteration with context
-- Includes summary of previously generated content (last 3 sections)
-- Includes continuation instructions with last section ID, element index, remaining requirements
-
----
-
-### 4. `processDocumentsPerChunkJsonWithPrompt`
-**Location**: Line 219-301
-**Status**: ✅ **USED ELSEWHERE**
-**Called by**:
- - `_processWithContinuationLoop` (line 402)
- - Also referenced in backup files (not active code)
-
-**Internal Usage**:
-- Called from `_processWithContinuationLoop` at line 402
-
-**External Usage Search**:
-- ✅ Used internally by continuation loop
-- ⚠️ Referenced in `local/backup/backup_mainServiceAi.py.txt` (backup file, not active)
-- ❌ Not used by any other active code
-
-**Functionality**:
-- Processes documents with per-chunk AI calls
-- Uses a custom prompt instead of default extraction prompt
-- Returns merged JSON document
-
-**Note**: This function itself is only used by the continuation loop. However, it's a more general function that could be useful, so it's not "dead code" - it's just currently only used by unused code.
-
----
-
-## Summary Table
-
-| Function | Line | Status | Called By | Effectively Used? |
-|----------|------|--------|-----------|-------------------|
-| `processDocumentsWithContinuation` | 303 | ❌ Not used | (external) | ❌ No |
-| `_buildContinuationPrompt` | 324 | ✅ Used internally | `processDocumentsWithContinuation:319` | ❌ No |
-| `_processWithContinuationLoop` | 373 | ✅ Used internally | `processDocumentsWithContinuation:322` | ❌ No |
-| `_buildContinuationIterationPrompt` | 459 | ✅ Used internally | `_processWithContinuationLoop:393` | ❌ No |
-| `processDocumentsPerChunkJsonWithPrompt` | 219 | ✅ Used internally | `_processWithContinuationLoop:402` | ⚠️ **ONLY USED BY UNUSED CODE** |
-
----
-
-## Current Active Implementation
-
-The active continuation logic is in `subCoreAi.callAiDocuments()` → `_callAiWithLooping()`:
-- Uses `buildGenerationPrompt()` with `continuationContext` parameter
-- Uses `buildContinuationContext()` to build context from sections
-- Different continuation pattern (uses `last_raw_json` instead of `continuation_context`)
-
----
-
-## Dead Code Identification
-
-**Completely Unused Chain** (can be safely removed):
-1. ✅ `processDocumentsWithContinuation` - entry point, not called
-2. ✅ `_buildContinuationPrompt` - only used by #1
-3. ✅ `_processWithContinuationLoop` - only used by #1
-4. ✅ `_buildContinuationIterationPrompt` - only used by #3
-
-**Potentially Unused** (only used by dead code):
-- ⚠️ `processDocumentsPerChunkJsonWithPrompt` - only caller is dead code, but function is general-purpose
-
----
-
-## Recommendations
-
-1. **Remove Dead Code Chain**: All four functions (`processDocumentsWithContinuation`, `_buildContinuationPrompt`, `_processWithContinuationLoop`, `_buildContinuationIterationPrompt`) can be safely removed.
-
-2. **For `processDocumentsPerChunkJsonWithPrompt`**:
- - **Option A**: Remove if not needed (it's only used by the dead continuation chain)
- - **Option B**: Keep if it might be useful for future custom prompt processing
- - **Recommendation**: Since it's a general-purpose function that could be useful, keep it but note that it's currently unused.
-
-3. **If Keeping**: Document why this continuation logic exists but is unused, or mark as deprecated/legacy alternative to `_callAiWithLooping()`.
-
----
-
-## Verification Commands
-
-To verify these findings:
-
-```bash
-# Search for actual function calls (should return no results for the main function)
-grep -r "\.processDocumentsWithContinuation(" gateway/ --exclude-dir=wiki --exclude-dir=local --exclude-dir=backup
-
-# Search for _buildContinuationPrompt usage (should only find the definition)
-grep -r "_buildContinuationPrompt" gateway/ --exclude-dir=wiki --exclude-dir=local --exclude-dir=backup --exclude="*.md"
-
-# Search for _processWithContinuationLoop usage (should only find the definition)
-grep -r "_processWithContinuationLoop" gateway/ --exclude-dir=wiki --exclude-dir=local --exclude-dir=backup --exclude="*.md"
-```
-