# gateway/modules/workflows/processing/adaptive/contentValidator.py

# contentValidator.py
# Content validation for adaptive React mode
# Generic, document-aware validation system

import logging
import json
import base64
import re
from typing import List, Dict, Any, Optional

logger = logging.getLogger(__name__)

# Configuration constants
MAX_CONTENT_SIZE_FOR_FULL_PREVIEW = 50 * 1024  # 50KB threshold
PREVIEW_SAMPLE_SIZE = 1024  # 1KB preview for large documents


class ContentValidator:
    """Validates delivered content against user intent - generic and document-aware"""

    def __init__(self, services=None, learningEngine=None):
        """Store the collaborators handed in by the caller.

        Args:
            services: Service container; `_validateWithAI` reads its `ai` attribute.
            learningEngine: Kept on the instance as-is; not read by the methods
                visible in this class.
        """
        self.learningEngine = learningEngine
        self.services = services

    async def validateContent(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
        """Validates delivered content against user intent using AI (single attempt; parse-or-fail)"""
        return await self._validateWithAI(documents, intent)

    def _analyzeDocuments(self, documents: List[Any]) -> List[Dict[str, Any]]:
        """Generic document analysis - create simple summaries with metadata."""
        summaries = []
        for doc in documents:
            try:
                data = getattr(doc, 'documentData', None)
                name = getattr(doc, 'documentName', 'Unknown')
                mimeType = getattr(doc, 'mimeType', 'unknown')
                formatExt = self._detectFormat(doc)
                sizeInfo = self._calculateSize(doc)

                # Simple preview: if it's dict/list, dump JSON; otherwise use string
                preview = None
                if data is not None:
                    if isinstance(data, (dict, list)):
                        preview = json.dumps(data, indent=2, ensure_ascii=False)
                        # Truncate if too large
                        if len(preview) > MAX_CONTENT_SIZE_FOR_FULL_PREVIEW:
                            preview = preview[:PREVIEW_SAMPLE_SIZE] + f"\n\n[Truncated - {self._formatBytes(sizeInfo['bytes'])} total]"
                    else:
                        text = str(data)
                        if len(text) > MAX_CONTENT_SIZE_FOR_FULL_PREVIEW:
                            preview = text[:PREVIEW_SAMPLE_SIZE] + f"\n\n[Truncated - {self._formatBytes(sizeInfo['bytes'])} total]"
                        else:
                            preview = text

                summary = {
                    "name": name,
                    "mimeType": mimeType,
                    "format": formatExt,
                    "size": sizeInfo["readable"],
                    "preview": preview
                }
                summaries.append(summary)
            except Exception as e:
                logger.warning(f"Error analyzing document {getattr(doc, 'documentName', 'Unknown')}: {str(e)}")
                summaries.append({
                    "name": getattr(doc, 'documentName', 'Unknown'),
                    "mimeType": getattr(doc, 'mimeType', 'unknown'),
                    "format": "unknown",
                    "size": "0 B",
                    "preview": None,
                    "error": str(e)
                })
        return summaries

    def _calculateAvailablePromptSpace(self, basePromptSizeBytes: int) -> int:
        """Estimate how many bytes of the prompt remain for document summaries.

        The budget is derived from the first model in the failover list for a
        PLAN operation: 80% of its context length, converted at 4 bytes per
        token. The base prompt size is subtracted and a further 10% safety
        margin is removed. The result is never lower than 1024 bytes.

        Args:
            basePromptSizeBytes: Size in bytes of the prompt text that precedes
                the document summaries.

        Returns:
            Number of bytes available for summaries. If no model is listed a
            16K-token context is assumed; if anything raises (including the
            lazy project imports) a flat 8 KB is returned.
        """
        try:
            # Imported lazily, inside the try, so an import failure uses the fallback
            from modules.aicore.aicoreModelRegistry import modelRegistry
            from modules.aicore.aicoreModelSelector import modelSelector
            from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum

            registeredModels = modelRegistry.getAvailableModels()

            # Validation runs as a PLAN operation, so ask for that model ranking
            planOptions = AiCallOptions(
                operationType=OperationTypeEnum.PLAN,
                priority=None,
                processingMode=None
            )
            rankedModels = modelSelector.getFailoverModelList("", "", planOptions, registeredModels)

            if rankedModels:
                # First entry is the preferred model: 80% of context, 4 bytes per token
                preferred = rankedModels[0]
                maxBytes = int(preferred.contextLength * 0.8 * 4)
            else:
                logger.warning("No models available for space calculation, using fallback: 16K tokens")
                maxBytes = 16 * 1024 * 4  # conservative 16K-token context

            # Remove the base prompt, then hold back a 10% safety margin
            usableBytes = int((maxBytes - basePromptSizeBytes) * 0.9)
            if usableBytes < 1024:
                usableBytes = 1024  # floor of 1 KB

            logger.debug(f"Prompt space calculation: base={basePromptSizeBytes} bytes, max={maxBytes} bytes, available={usableBytes} bytes")
            return usableBytes

        except Exception as exc:
            logger.warning(f"Error calculating available prompt space: {str(exc)}, using fallback: 8KB")
            return 8 * 1024

    def _analyzeDocumentsWithSizeLimit(self, documents: List[Any], maxTotalBytes: int) -> List[Dict[str, Any]]:
        """Summarise documents while keeping the combined previews within a byte budget.

        The budget is split evenly: 200 bytes per document are reserved for JSON
        structure, the remainder is divided among the documents (minimum 100
        bytes each), and 70% of each share is granted to the preview text.
        Previews whose UTF-8 encoding exceeds that allowance are cut at the
        byte limit and suffixed with a truncation note.

        A document that fails to analyse does not abort the run: a warning is
        logged and a placeholder summary with an extra "error" key is appended.

        Args:
            documents: Objects exposing `documentData`, `documentName` and
                `mimeType` attributes (all optional).
            maxTotalBytes: Total number of bytes available for all summaries.

        Returns:
            One summary dictionary per document ("name", "mimeType", "format",
            "size", "preview"), in input order; an empty list for no documents.
        """
        if not documents:
            return []

        # Reserve space for JSON structure overhead (approximately 200 bytes per document)
        jsonOverheadPerDoc = 200
        reservedOverhead = len(documents) * jsonOverheadPerDoc
        availableForContent = max(0, maxTotalBytes - reservedOverhead)

        # Even split, but never less than 100 bytes per document
        bytesPerDoc = max(availableForContent // len(documents), 100)
        # Estimate: preview takes ~70% of a document summary; same for every document
        maxPreviewBytes = int(bytesPerDoc * 0.7)

        logger.debug(f"Document summary space: total={maxTotalBytes} bytes, available={availableForContent} bytes, perDoc={bytesPerDoc} bytes")

        summaries = []
        for doc in documents:
            try:
                data = getattr(doc, 'documentData', None)
                name = getattr(doc, 'documentName', 'Unknown')
                mimeType = getattr(doc, 'mimeType', 'unknown')
                formatExt = self._detectFormat(doc)
                sizeInfo = self._calculateSize(doc)

                preview = None
                if data is not None:
                    if isinstance(data, (dict, list)):
                        preview = json.dumps(data, indent=2, ensure_ascii=False)
                    else:
                        preview = str(data)

                    encodedPreview = preview.encode('utf-8')
                    if len(encodedPreview) > maxPreviewBytes:
                        # Cutting bytes may split a multi-byte character; errors='ignore'
                        # drops the partial sequence, so this decode cannot raise.
                        preview = encodedPreview[:maxPreviewBytes].decode('utf-8', errors='ignore')
                        preview += f"\n\n[Truncated - {self._formatBytes(sizeInfo['bytes'])} total]"

                summaries.append({
                    "name": name,
                    "mimeType": mimeType,
                    "format": formatExt,
                    "size": sizeInfo["readable"],
                    "preview": preview
                })
            except Exception as e:
                logger.warning(f"Error analyzing document {getattr(doc, 'documentName', 'Unknown')}: {str(e)}")
                summaries.append({
                    "name": getattr(doc, 'documentName', 'Unknown'),
                    "mimeType": getattr(doc, 'mimeType', 'unknown'),
                    "format": "unknown",
                    "size": "0 B",
                    "preview": None,
                    "error": str(e)
                })

        return summaries

    def _detectFormat(self, doc: Any) -> str:
        """Extract format from filename extension (always use extension)"""
        try:
            docName = getattr(doc, 'documentName', '')

            # Extract from filename extension
            if docName and '.' in docName:
                ext = docName.rsplit('.', 1)[1].lower()
                return ext

            return 'unknown'
        except Exception as e:
            logger.warning(f"Error detecting format: {str(e)}")
            return 'unknown'

    def _calculateSize(self, doc: Any) -> Dict[str, Any]:
        """Calculate document size in bytes and human-readable format"""
        try:
            if not hasattr(doc, 'documentData') or doc.documentData is None:
                return {"bytes": 0, "readable": "0 B"}

            data = doc.documentData
            size_bytes = 0

            if isinstance(data, str):
                size_bytes = len(data.encode('utf-8'))
            elif isinstance(data, bytes):
                size_bytes = len(data)
            elif isinstance(data, (dict, list)):
                # Estimate JSON size
                try:
                    json_str = json.dumps(data)
                    size_bytes = len(json_str.encode('utf-8'))
                except:
                    size_bytes = len(str(data).encode('utf-8'))
            else:
                size_bytes = len(str(data).encode('utf-8'))

            # Convert to human-readable format
            readable = self._formatBytes(size_bytes)

            return {"bytes": size_bytes, "readable": readable}
        except Exception as e:
            logger.warning(f"Error calculating size: {str(e)}")
            return {"bytes": 0, "readable": "0 B"}

    def _formatBytes(self, bytes: int) -> str:
        """Render a byte count with one decimal place and a binary unit.

        The value is divided by 1024 until it drops below 1024, stepping through
        B, KB, MB and GB; anything still larger is reported in TB.

        Args:
            bytes: Size to format (the parameter name shadows the builtin and is
                kept for interface compatibility).

        Returns:
            A string such as "512.0 B", "1.5 KB" or "3.2 TB".
        """
        amount = bytes
        unitLabels = ('B', 'KB', 'MB', 'GB')
        position = 0
        while position < len(unitLabels):
            if amount < 1024.0:
                return f"{amount:.1f} {unitLabels[position]}"
            amount = amount / 1024.0
            position += 1
        return f"{amount:.1f} TB"


    def _isFormatCompatible(self, deliveredFormat: str, expectedFormat: str) -> bool:
        """
        Decide whether a delivered format can stand in for the expected one.

        Comparison is case-insensitive. Two formats are compatible when:
        - they are identical, or
        - both are text formats (txt, md, html, text, plain), or
        - both are structured formats (json, xml).

        Everything else is incompatible; in particular pdf/docx/xlsx/pptx only
        match themselves, and image formats only match on exact equality.
        """
        delivered = deliveredFormat.lower()
        expected = expectedFormat.lower()

        if delivered == expected:
            return True

        # Families whose members may substitute for one another
        interchangeableFamilies = (
            ('txt', 'md', 'html', 'text', 'plain'),
            ('json', 'xml'),
        )
        return any(
            delivered in family and expected in family
            for family in interchangeableFamilies
        )

    async def _validateWithAI(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
        """AI-based comprehensive validation - generic approach"""
        try:
            if not hasattr(self, 'services') or not self.services or not hasattr(self.services, 'ai'):
                return self._createFailedValidationResult("AI service not available")

            # Build prompt base WITHOUT document summaries first
            successCriteria = intent.get('successCriteria', [])
            criteriaCount = len(successCriteria)

            promptBase = f"""TASK VALIDATION

USER REQUEST: '{intent.get('primaryGoal', 'Unknown')}'
EXPECTED DATA TYPE: {intent.get('dataType', 'unknown')}
EXPECTED FORMAT: {intent.get('expectedFormat', 'unknown')}
SUCCESS CRITERIA ({criteriaCount} items): {successCriteria}

VALIDATION RULES:
1. Check if delivered documents match expected data type
2. Check if delivered formats are compatible with expected format
3. Verify each success criterion is met based on document content/metadata
4. Check document sizes are reasonable for the task
5. Rate overall quality (0.0-1.0)
6. Identify specific gaps based on what the user requested

OUTPUT FORMAT - JSON ONLY (no prose):
{{
  "overallSuccess": false,
  "qualityScore": 0.0,
  "dataTypeMatch": false,
  "formatMatch": false,
  "documentCount": {len(documents)},
  "successCriteriaMet": {[False] * criteriaCount},
  "gapAnalysis": "Describe what is missing or incorrect",
  "improvementSuggestions": ["General action to improve overall result"],
  "validationDetails": [
    {{
      "documentName": "document.ext",
      "issues": ["Specific problem with this document"],
      "suggestions": ["Specific fix for this document's issues"]
    }}
  ]
}}

Field explanations:
- "improvementSuggestions": Overall actions to improve the entire result (general, high-level)
- "validationDetails[].suggestions": Specific fixes for each document's individual issues (document-specific, detailed)
- Do NOT use prefixes like "NEXT STEP:" - describe actions directly

DELIVERED DOCUMENTS ({len(documents)} items):
"""

            # Calculate available space for document summaries
            # Get the model that will be used for validation
            basePromptSize = len(promptBase.encode('utf-8'))
            availableBytes = self._calculateAvailablePromptSpace(basePromptSize)

            # Analyze documents with size constraints
            documentSummaries = self._analyzeDocumentsWithSizeLimit(documents, availableBytes)

            # Build final prompt with summaries at the end
            documentsJson = json.dumps(documentSummaries, indent=2)
            validationPrompt = promptBase + documentsJson

            # Call AI service for validation
            response = await self.services.ai.callAiPlanning(
                prompt=validationPrompt,
                placeholders=None
            )

            if not response or not response.strip():
                logger.warning("AI validation returned empty response")
                raise ValueError("AI validation failed - empty response")

            # Clean and extract JSON from response
            result = response.strip()
            logger.debug(f"AI validation response length: {len(result)}")

            # Try to find JSON in the response with multiple strategies
            # Strategy 1: Look for JSON in markdown code blocks
            json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', result, re.DOTALL)
            if json_match:
                result = json_match.group(1)
                logger.debug(f"Extracted JSON from markdown code block: {result[:200]}...")
            else:
                # Strategy 2: Look for JSON object with proper structure
                json_match = re.search(r'\{[^{}]*"overallSuccess"[^{}]*\}', result, re.DOTALL)
                if not json_match:
                    # Strategy 3: Look for any JSON object
                    json_match = re.search(r'\{.*\}', result, re.DOTALL)

                if json_match:
                    result = json_match.group(0)
                    logger.debug(f"Extracted JSON directly: {result[:200]}...")
                else:
                    logger.debug(f"No JSON found in AI response: {result[:200]}...")
                    logger.debug(f"Full AI response: {result}")
                    raise ValueError("AI validation failed - no JSON in response")

            try:
                aiResult = json.loads(result)
                logger.info("AI validation JSON parsed successfully")

                overall = aiResult.get("overallSuccess")
                quality = aiResult.get("qualityScore")
                details = aiResult.get("validationDetails")
                gap = aiResult.get("gapAnalysis", "")
                criteria = aiResult.get("successCriteriaMet")
                improvements = aiResult.get("improvementSuggestions", [])

                # Normalize while keeping failures explicit
                normalized = {
                    "overallSuccess": overall if isinstance(overall, bool) else None,
                    "qualityScore": float(quality) if isinstance(quality, (int, float)) else None,
                    "documentCount": len(documentSummaries),
                    "validationDetails": details if isinstance(details, list) else [{
                        "documentName": "AI Validation",
                        "gapAnalysis": gap,
                        "successCriteriaMet": criteria if isinstance(criteria, list) else []
                    }],
                    "improvementSuggestions": improvements,
                    "schemaCompliant": True,
                    "originalType": "json",
                    "missingFields": []
                }

                if normalized["overallSuccess"] is None:
                    normalized["missingFields"].append("overallSuccess")
                if normalized["qualityScore"] is None:
                    normalized["missingFields"].append("qualityScore")
                if normalized["missingFields"]:
                    normalized["schemaCompliant"] = False

                return normalized

            except json.JSONDecodeError as json_error:
                logger.warning(f"AI validation invalid JSON: {str(json_error)}")
                logger.debug(f"JSON content: {result}")
                raise

            raise ValueError("AI validation failed - no response")

        except Exception as e:
            logger.error(f"AI validation failed: {str(e)}")
            raise

    def _createFailedValidationResult(self, errorMessage: str) -> Dict[str, Any]:
        """Build the uniform result returned when validation could not be performed.

        Args:
            errorMessage: Reason for the failure; stored under both
                "gapAnalysis" and "error".

        Returns:
            A fresh dictionary with every success flag False, a quality score of
            0.0, empty collections, and "originalType" set to "error".
        """
        failure: Dict[str, Any] = {}
        failure["overallSuccess"] = False
        failure["qualityScore"] = 0.0
        failure["dataTypeMatch"] = False
        failure["formatMatch"] = False
        failure["documentCount"] = 0
        failure["successCriteriaMet"] = []
        failure["gapAnalysis"] = errorMessage
        failure["improvementSuggestions"] = []
        failure["validationDetails"] = []
        failure["schemaCompliant"] = True
        failure["originalType"] = "error"
        failure["missingFields"] = []
        failure["error"] = errorMessage
        return failure