# contentValidator.py
# Content validation for adaptive React mode
# Generic, document-aware validation system

import logging
import json
import re
from typing import List, Dict, Any

logger = logging.getLogger(__name__)

# Configuration constants
MAX_CONTENT_SIZE_FOR_FULL_PREVIEW = 50 * 1024  # 50KB threshold
PREVIEW_SAMPLE_SIZE = 1024  # 1KB preview for large documents


class ContentValidator:
    """Validates delivered content against user intent - generic and document-aware"""

    def __init__(self, services=None, learningEngine=None):
        self.services = services
        self.learningEngine = learningEngine

    async def validateContent(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
        """Validate delivered content against user intent using AI (single attempt; parse-or-fail)."""
        return await self._validateWithAI(documents, intent)

    def _analyzeDocuments(self, documents: List[Any]) -> List[Dict[str, Any]]:
        """Generic document analysis - create simple summaries with metadata."""
        summaries = []
        for doc in documents:
            try:
                data = getattr(doc, 'documentData', None)
                name = getattr(doc, 'documentName', 'Unknown')
                mimeType = getattr(doc, 'mimeType', 'unknown')
                formatExt = self._detectFormat(doc)
                sizeInfo = self._calculateSize(doc)

                # Simple preview: if it's a dict/list, dump JSON; otherwise use the string form.
                # Note: these checks compare character counts against a byte threshold,
                # which is a close-enough approximation for mostly-ASCII content.
                preview = None
                if data is not None:
                    if isinstance(data, (dict, list)):
                        preview = json.dumps(data, indent=2, ensure_ascii=False)
                        # Truncate if too large
                        if len(preview) > MAX_CONTENT_SIZE_FOR_FULL_PREVIEW:
                            preview = preview[:PREVIEW_SAMPLE_SIZE] + f"\n\n[Truncated - {self._formatBytes(sizeInfo['bytes'])} total]"
                    else:
                        text = str(data)
                        if len(text) > MAX_CONTENT_SIZE_FOR_FULL_PREVIEW:
                            preview = text[:PREVIEW_SAMPLE_SIZE] + f"\n\n[Truncated - {self._formatBytes(sizeInfo['bytes'])} total]"
                        else:
                            preview = text

                summary = {
                    "name": name,
                    "mimeType": mimeType,
                    "format": formatExt,
                    "size": sizeInfo["readable"],
                    "preview": preview
                }
                summaries.append(summary)
            except Exception as e:
                logger.warning(f"Error analyzing document {getattr(doc, 'documentName', 'Unknown')}: {str(e)}")
                summaries.append({
                    "name": getattr(doc, 'documentName', 'Unknown'),
                    "mimeType": getattr(doc, 'mimeType', 'unknown'),
                    "format": "unknown",
                    "size": "0 B",
                    "preview": None,
                    "error": str(e)
                })
        return summaries
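    # Illustrative example (assumed document shape, not a real class from this
    # codebase): for an object with documentName="stats.json" and
    # documentData={"rows": 3}, _analyzeDocuments would yield a summary like
    #   {"name": "stats.json", "mimeType": "application/json", "format": "json",
    #    "size": "11.0 B", "preview": "{\n  \"rows\": 3\n}"}
    # mimeType is read verbatim from the document object and is not validated here.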
    def _calculateAvailablePromptSpace(self, basePromptSizeBytes: int) -> int:
        """Calculate available space for document summaries based on model context length."""
        try:
            from modules.aicore.aicoreModelRegistry import modelRegistry
            from modules.aicore.aicoreModelSelector import modelSelector
            from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum

            # Get available models
            availableModels = modelRegistry.getAvailableModels()

            # Create options for the PLAN operation (what validation uses)
            options = AiCallOptions(
                operationType=OperationTypeEnum.PLAN,
                priority=None,
                processingMode=None
            )

            # Get the failover model list to find the model that will be used
            failoverModels = modelSelector.getFailoverModelList("", "", options, availableModels)

            if not failoverModels:
                # Fallback: assume a 16K-token context (conservative)
                logger.warning("No models available for space calculation, using fallback: 16K tokens")
                maxBytes = 16 * 1024 * 4  # 16K tokens * ~4 bytes per token
            else:
                # Use the first (best) model; budget 80% of its context length,
                # converted to bytes at ~4 bytes per token
                model = failoverModels[0]
                maxBytes = int(model.contextLength * 0.8 * 4)

            # Available space = (max - base prompt), minus a 10% safety margin
            availableBytes = int((maxBytes - basePromptSizeBytes) * 0.9)
            # Ensure minimum available space (at least 1KB)
            availableBytes = max(availableBytes, 1024)

            logger.debug(f"Prompt space calculation: base={basePromptSizeBytes} bytes, max={maxBytes} bytes, available={availableBytes} bytes")
            return availableBytes
        except Exception as e:
            logger.warning(f"Error calculating available prompt space: {str(e)}, using fallback: 8KB")
            # Fallback: assume 8KB available
            return 8 * 1024

    def _analyzeDocumentsWithSizeLimit(self, documents: List[Any], maxTotalBytes: int) -> List[Dict[str, Any]]:
        """Analyze documents with a size limit, dividing available space evenly among documents."""
        if not documents:
            return []

        # Reserve space for JSON structure overhead (approximately 200 bytes per document)
        jsonOverheadPerDoc = 200
        reservedOverhead = len(documents) * jsonOverheadPerDoc
        availableForContent = max(0, maxTotalBytes - reservedOverhead)

        # Divide available space evenly among documents,
        # ensuring a minimum per document (at least 100 bytes)
        bytesPerDoc = max(availableForContent // len(documents), 100)

        logger.debug(f"Document summary space: total={maxTotalBytes} bytes, available={availableForContent} bytes, perDoc={bytesPerDoc} bytes")

        summaries = []
        for doc in documents:
            try:
                data = getattr(doc, 'documentData', None)
                name = getattr(doc, 'documentName', 'Unknown')
                mimeType = getattr(doc, 'mimeType', 'unknown')
                formatExt = self._detectFormat(doc)
                sizeInfo = self._calculateSize(doc)

                # Create preview with size limit
                preview = None
                if data is not None:
                    if isinstance(data, (dict, list)):
                        preview = json.dumps(data, indent=2, ensure_ascii=False)
                    else:
                        preview = str(data)

                    # Truncate the preview to fit within bytesPerDoc.
                    # Estimate: the preview may take ~70% of the document summary
                    # space; the rest is metadata and JSON structure.
                    maxPreviewBytes = int(bytesPerDoc * 0.7)
                    if len(preview.encode('utf-8')) > maxPreviewBytes:
                        # Truncate at the byte boundary; errors='ignore' drops any
                        # multi-byte character cut in half, so decoding cannot raise
                        truncated = preview.encode('utf-8')[:maxPreviewBytes]
                        preview = truncated.decode('utf-8', errors='ignore')
                        preview += f"\n\n[Truncated - {self._formatBytes(sizeInfo['bytes'])} total]"

                summary = {
                    "name": name,
                    "mimeType": mimeType,
                    "format": formatExt,
                    "size": sizeInfo["readable"],
                    "preview": preview
                }
                summaries.append(summary)
            except Exception as e:
                logger.warning(f"Error analyzing document {getattr(doc, 'documentName', 'Unknown')}: {str(e)}")
                summaries.append({
                    "name": getattr(doc, 'documentName', 'Unknown'),
                    "mimeType": getattr(doc, 'mimeType', 'unknown'),
                    "format": "unknown",
                    "size": "0 B",
                    "preview": None,
                    "error": str(e)
                })
        return summaries
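    # Worked example of the budget split (assumed numbers): with
    # maxTotalBytes=10_000 and 4 documents, reservedOverhead = 4 * 200 = 800,
    # availableForContent = 9_200, bytesPerDoc = 9_200 // 4 = 2_300, and each
    # preview is capped at int(2_300 * 0.7) = 1_610 bytes before the
    # "[Truncated - ...]" notice is appended.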
    def _detectFormat(self, doc: Any) -> str:
        """Extract format from the filename extension (always use the extension)."""
        try:
            docName = getattr(doc, 'documentName', '')
            # Extract from the filename extension
            if docName and '.' in docName:
                return docName.rsplit('.', 1)[1].lower()
            return 'unknown'
        except Exception as e:
            logger.warning(f"Error detecting format: {str(e)}")
            return 'unknown'

    def _calculateSize(self, doc: Any) -> Dict[str, Any]:
        """Calculate document size in bytes and in human-readable form."""
        try:
            if not hasattr(doc, 'documentData') or doc.documentData is None:
                return {"bytes": 0, "readable": "0 B"}
            data = doc.documentData
            if isinstance(data, str):
                size_bytes = len(data.encode('utf-8'))
            elif isinstance(data, bytes):
                size_bytes = len(data)
            elif isinstance(data, (dict, list)):
                # Estimate JSON size; fall back to the repr if not serializable
                try:
                    size_bytes = len(json.dumps(data).encode('utf-8'))
                except (TypeError, ValueError):
                    size_bytes = len(str(data).encode('utf-8'))
            else:
                size_bytes = len(str(data).encode('utf-8'))
            # Convert to human-readable format
            return {"bytes": size_bytes, "readable": self._formatBytes(size_bytes)}
        except Exception as e:
            logger.warning(f"Error calculating size: {str(e)}")
            return {"bytes": 0, "readable": "0 B"}

    def _formatBytes(self, numBytes: float) -> str:
        """Format a byte count as a human-readable string."""
        for unit in ['B', 'KB', 'MB', 'GB']:
            if numBytes < 1024.0:
                return f"{numBytes:.1f} {unit}"
            numBytes /= 1024.0
        return f"{numBytes:.1f} TB"

    def _isFormatCompatible(self, deliveredFormat: str, expectedFormat: str) -> bool:
        """
        Generic format compatibility check:
        - txt/md/html are text formats (compatible with each other)
        - json/xml are structured formats (compatible with each other)
        - pdf/docx/xlsx/pptx are document formats (NOT compatible with each other)
        - anything else (e.g. image formats) only matches itself exactly
        """
        deliveredLower = deliveredFormat.lower()
        expectedLower = expectedFormat.lower()

        # Exact match
        if deliveredLower == expectedLower:
            return True

        # Text formats are interchangeable
        textFormats = ['txt', 'md', 'html', 'text', 'plain']
        if deliveredLower in textFormats and expectedLower in textFormats:
            return True

        # Structured formats
        if deliveredLower in ['json', 'xml'] and expectedLower in ['json', 'xml']:
            return True

        # Document formats are NOT compatible with each other
        documentFormats = ['pdf', 'docx', 'xlsx', 'pptx']
        if deliveredLower in documentFormats and expectedLower in documentFormats:
            return False  # pdf ≠ docx

        return False
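    # Compatibility examples (doctest-style, assuming v = ContentValidator()):
    # >>> v._isFormatCompatible('md', 'txt')
    # True   (both in the text group)
    # >>> v._isFormatCompatible('json', 'xml')
    # True   (structured formats are interchangeable)
    # >>> v._isFormatCompatible('pdf', 'docx')
    # False  (document formats never substitute for each other)
    # >>> v._isFormatCompatible('png', 'jpg')
    # False  (no image group is defined, so only an exact match passes)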
    async def _validateWithAI(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
        """AI-based comprehensive validation - generic approach."""
        try:
            if not self.services or not hasattr(self.services, 'ai'):
                return self._createFailedValidationResult("AI service not available")

            # Build the prompt base WITHOUT document summaries first.
            # Lists are serialized with json.dumps so the template stays valid JSON
            # (a plain f-string interpolation would leak Python's repr, e.g. [False]).
            successCriteria = intent.get('successCriteria', [])
            criteriaCount = len(successCriteria)

            promptBase = f"""TASK VALIDATION

USER REQUEST: '{intent.get('primaryGoal', 'Unknown')}'
EXPECTED DATA TYPE: {intent.get('dataType', 'unknown')}
EXPECTED FORMAT: {intent.get('expectedFormat', 'unknown')}
SUCCESS CRITERIA ({criteriaCount} items): {json.dumps(successCriteria)}

VALIDATION RULES:
1. Check if delivered documents match the expected data type
2. Check if delivered formats are compatible with the expected format
3. Verify each success criterion is met based on document content/metadata
4. Check document sizes are reasonable for the task
5. Rate overall quality (0.0-1.0)
6. Identify specific gaps based on what the user requested

OUTPUT FORMAT - JSON ONLY (no prose):
{{
  "overallSuccess": false,
  "qualityScore": 0.0,
  "dataTypeMatch": false,
  "formatMatch": false,
  "documentCount": {len(documents)},
  "successCriteriaMet": {json.dumps([False] * criteriaCount)},
  "gapAnalysis": "Describe what is missing or incorrect",
  "improvementSuggestions": ["General action to improve overall result"],
  "validationDetails": [
    {{
      "documentName": "document.ext",
      "issues": ["Specific problem with this document"],
      "suggestions": ["Specific fix for this document's issues"]
    }}
  ]
}}

Field explanations:
- "improvementSuggestions": Overall actions to improve the entire result (general, high-level)
- "validationDetails[].suggestions": Specific fixes for each document's individual issues (document-specific, detailed)
- Do NOT use prefixes like "NEXT STEP:" - describe actions directly

DELIVERED DOCUMENTS ({len(documents)} items):
"""

            # Calculate the space available for document summaries,
            # based on the model that will be used for validation
            basePromptSize = len(promptBase.encode('utf-8'))
            availableBytes = self._calculateAvailablePromptSpace(basePromptSize)

            # Analyze documents with size constraints
            documentSummaries = self._analyzeDocumentsWithSizeLimit(documents, availableBytes)

            # Build the final prompt with summaries at the end
            documentsJson = json.dumps(documentSummaries, indent=2)
            validationPrompt = promptBase + documentsJson

            # Call the AI service for validation
            response = await self.services.ai.callAiPlanning(
                prompt=validationPrompt,
                placeholders=None
            )

            if not response or not response.strip():
                logger.warning("AI validation returned empty response")
                raise ValueError("AI validation failed - empty response")

            # Clean and extract JSON from the response
            result = response.strip()
            logger.debug(f"AI validation response length: {len(result)}")

            # Try to find JSON in the response with multiple strategies.
            # Strategy 1: look for JSON in a markdown code block
            json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', result, re.DOTALL)
            if json_match:
                result = json_match.group(1)
                logger.debug(f"Extracted JSON from markdown code block: {result[:200]}...")
            else:
                # Strategy 2: look for a flat JSON object containing "overallSuccess"
                # (only matches objects without nested braces)
                json_match = re.search(r'\{[^{}]*"overallSuccess"[^{}]*\}', result, re.DOTALL)
                if not json_match:
                    # Strategy 3: look for any JSON object
                    json_match = re.search(r'\{.*\}', result, re.DOTALL)
                if json_match:
                    result = json_match.group(0)
                    logger.debug(f"Extracted JSON directly: {result[:200]}...")
                else:
                    logger.debug(f"No JSON found in AI response: {result[:200]}...")
                    logger.debug(f"Full AI response: {result}")
                    raise ValueError("AI validation failed - no JSON in response")

            try:
                aiResult = json.loads(result)
                logger.info("AI validation JSON parsed successfully")
                overall = aiResult.get("overallSuccess")
                quality = aiResult.get("qualityScore")
                details = aiResult.get("validationDetails")
                gap = aiResult.get("gapAnalysis", "")
                criteria = aiResult.get("successCriteriaMet")
                improvements = aiResult.get("improvementSuggestions", [])

                # Normalize while keeping failures explicit: fields of the wrong
                # type become None and are reported in missingFields below
                normalized = {
                    "overallSuccess": overall if isinstance(overall, bool) else None,
                    "qualityScore": float(quality) if isinstance(quality, (int, float)) else None,
                    "documentCount": len(documentSummaries),
                    "validationDetails": details if isinstance(details, list) else [{
                        "documentName": "AI Validation",
                        "gapAnalysis": gap,
                        "successCriteriaMet": criteria if isinstance(criteria, list) else []
                    }],
                    "improvementSuggestions": improvements,
                    "schemaCompliant": True,
                    "originalType": "json",
                    "missingFields": []
                }
                if normalized["overallSuccess"] is None:
                    normalized["missingFields"].append("overallSuccess")
                if normalized["qualityScore"] is None:
                    normalized["missingFields"].append("qualityScore")
                if normalized["missingFields"]:
                    normalized["schemaCompliant"] = False
                return normalized
            except json.JSONDecodeError as json_error:
                logger.warning(f"AI validation invalid JSON: {str(json_error)}")
                logger.debug(f"JSON content: {result}")
                raise
        except Exception as e:
            logger.error(f"AI validation failed: {str(e)}")
            raise
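    # Example of the normalization contract (hypothetical model reply): if the
    # model returns {"qualityScore": 0.8} with no "overallSuccess" key, the
    # normalized result has overallSuccess=None, missingFields=["overallSuccess"],
    # and schemaCompliant=False, so callers can distinguish "the model judged the
    # task a failure" (False) from "the model never answered the question" (None).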
"json", "missingFields": [] } if normalized["overallSuccess"] is None: normalized["missingFields"].append("overallSuccess") if normalized["qualityScore"] is None: normalized["missingFields"].append("qualityScore") if normalized["missingFields"]: normalized["schemaCompliant"] = False return normalized except json.JSONDecodeError as json_error: logger.warning(f"AI validation invalid JSON: {str(json_error)}") logger.debug(f"JSON content: {result}") raise raise ValueError("AI validation failed - no response") except Exception as e: logger.error(f"AI validation failed: {str(e)}") raise def _createFailedValidationResult(self, errorMessage: str) -> Dict[str, Any]: """Create a standardized failed validation result""" return { "overallSuccess": False, "qualityScore": 0.0, "dataTypeMatch": False, "formatMatch": False, "documentCount": 0, "successCriteriaMet": [], "gapAnalysis": errorMessage, "improvementSuggestions": [], "validationDetails": [], "schemaCompliant": True, "originalType": "error", "missingFields": [], "error": errorMessage }