# gateway/modules/workflows/processing/adaptive/contentValidator.py

# contentValidator.py
# Content validation for adaptive React mode

import logging
import json
import re
from typing import List, Dict, Any

logger = logging.getLogger(__name__)

class ContentValidator:
    """Validates delivered content against user intent.

    Validation is delegated to an AI service reached through
    ``services.ai.callAi``. The AI reply is parsed exactly once
    (parse-or-fail): there are no retries and no text-based fallbacks, so
    failures surface as exceptions instead of being hidden.
    """

    # Content whose string form is longer than this is replaced by a placeholder.
    _LARGE_CONTENT_THRESHOLD = 100000
    # Maximum number of characters of each document that is sent to the AI.
    _PROMPT_CONTENT_LIMIT = 2000

    # JSON object wrapped in a markdown code fence (optionally tagged "json").
    _FENCED_JSON_PATTERN = re.compile(r'```(?:json)?\s*(\{.*?\})\s*```', re.DOTALL)
    # Brace-free (non-nested) JSON object that mentions "overallSuccess".
    _FLAT_RESULT_PATTERN = re.compile(r'\{[^{}]*"overallSuccess"[^{}]*\}', re.DOTALL)
    # Anything from the first "{" to the last "}" of the text.
    _ANY_OBJECT_PATTERN = re.compile(r'\{.*\}', re.DOTALL)

    def __init__(self, services=None, learningEngine=None):
        """Store the collaborators.

        Args:
            services: Service container; validation requires it to expose
                ``ai.callAi``.
            learningEngine: Stored as-is; not used by the methods of this class.
        """
        self.services = services
        self.learningEngine = learningEngine

    async def validateContent(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
        """Validate delivered documents against the user intent using AI.

        Single attempt, parse-or-fail: see ``_validateWithAI`` for the result
        shape and the exceptions that propagate.
        """
        return await self._validateWithAI(documents, intent)

    def _extractContent(self, doc: Any) -> str:
        """Return the textual content of a document, with size protection.

        Reads ``doc.documentData``; when that is a dict with a ``content`` key
        the value of that key is used, otherwise the data itself. Sized content
        whose string form exceeds the large-content threshold is replaced by a
        short placeholder that states its length.

        Returns:
            The content as a string, the placeholder for large content, or
            ``""`` when the document has no ``documentData`` or extraction fails.
        """
        try:
            if not hasattr(doc, 'documentData'):
                return ""

            data = doc.documentData
            content = data['content'] if isinstance(data, dict) and 'content' in data else data

            # Convert once and reuse the result for both the size check and the return value.
            text = str(content)
            if hasattr(content, '__len__') and len(text) > self._LARGE_CONTENT_THRESHOLD:
                return f"[Large document content - {len(text)} characters - truncated for validation]"
            return text
        except Exception:
            # Best effort: an unreadable document is validated as empty content.
            logger.debug("Could not extract document content", exc_info=True)
            return ""

    # Removed schema fallback creator to keep failures explicit

    def _isValidJsonResponse(self, response: str) -> bool:
        """Check whether the response contains a parseable, flat result object.

        Only a brace-free JSON object mentioning ``"overallSuccess"`` is
        recognised; responses with nested objects are reported as ``False``.
        """
        if not isinstance(response, str):
            return False
        flatMatch = self._FLAT_RESULT_PATTERN.search(response)
        if flatMatch is None:
            return False
        try:
            json.loads(flatMatch.group(0))
        except (ValueError, RecursionError):
            return False
        return True

    # Removed text-based fallback extraction to avoid hiding issues

    def _buildValidationPrompt(self, documentContents: List[Dict[str, Any]], intent: Dict[str, Any]) -> str:
        """Build the validation prompt for the AI service.

        Args:
            documentContents: JSON-serialisable list of ``{"name", "content"}``
                entries describing the delivered documents.
            intent: User intent; ``primaryGoal``, ``dataType``,
                ``expectedFormat`` and ``successCriteria`` are read from it.

        Returns:
            The prompt text, including a JSON example of the expected answer.
        """
        primaryGoal = intent.get('primaryGoal', 'Unknown')
        dataType = intent.get('dataType', 'unknown')
        expectedFormat = intent.get('expectedFormat', 'unknown')

        successCriteria = intent.get('successCriteria', [])
        if successCriteria is None:
            # An explicit null means "no criteria"; len(None) would crash below.
            successCriteria = []
        criteriaCount = len(successCriteria)

        # Serialise with json.dumps so the example shows JSON booleans ("false").
        # Interpolating the Python list directly would render "False", which is
        # not valid JSON and could be copied verbatim by the model.
        criteriaTemplate = json.dumps([False] * criteriaCount)

        return f"""TASK VALIDATION

USER REQUEST: '{primaryGoal}'
EXPECTED TYPE: {dataType}
EXPECTED FORMAT: {expectedFormat}
SUCCESS CRITERIA ({criteriaCount} items): {successCriteria}

VALIDATION RULES:
1. Check if content matches expected data type
2. Check if content matches expected format
3. Verify each success criterion is met
4. Rate overall quality (0.0-1.0)
5. Identify specific gaps
6. Suggest next steps

OUTPUT FORMAT - JSON ONLY (no prose):
{{
  "overallSuccess": false,
  "qualityScore": 0.0,
  "dataTypeMatch": false,
  "formatMatch": false,
  "successCriteriaMet": {criteriaTemplate},
  "gapAnalysis": "Specific gaps found",
  "improvementSuggestions": ["NEXT STEP: Action 1", "NEXT STEP: Action 2"],
  "validationDetails": [
    {{
      "documentName": "Document Name",
      "issues": ["Issue 1", "Issue 2"],
      "suggestions": ["NEXT STEP: Fix 1", "NEXT STEP: Fix 2"]
    }}
  ]
}}

DELIVERED CONTENT TO CHECK:
{json.dumps(documentContents, indent=2)}

"""

    def _extractJsonText(self, text: str) -> str:
        """Locate the JSON object inside an AI reply.

        Strategies, in order: an object inside a markdown code fence, a flat
        object mentioning ``"overallSuccess"``, then the span from the first
        ``{`` to the last ``}``.

        Returns:
            The candidate JSON text, or ``""`` when the reply contains no object.
        """
        fenced = self._FENCED_JSON_PATTERN.search(text)
        if fenced:
            return fenced.group(1)
        bare = self._FLAT_RESULT_PATTERN.search(text) or self._ANY_OBJECT_PATTERN.search(text)
        return bare.group(0) if bare else ""

    def _normalizeAiResult(self, aiResult: Dict[str, Any]) -> Dict[str, Any]:
        """Normalise the parsed AI answer while keeping failures explicit.

        ``overallSuccess`` must be a bool and ``qualityScore`` a real number;
        otherwise the field is set to ``None``, listed in ``missingFields`` and
        ``schemaCompliant`` becomes ``False``. When ``validationDetails`` is not
        a list, a single synthetic entry carrying the gap analysis and the
        per-criterion results is used instead.
        """
        overall = aiResult.get("overallSuccess")
        quality = aiResult.get("qualityScore")
        details = aiResult.get("validationDetails")
        criteria = aiResult.get("successCriteriaMet")

        # bool is a subclass of int; a boolean is not a meaningful quality score.
        hasNumericQuality = isinstance(quality, (int, float)) and not isinstance(quality, bool)

        if not isinstance(details, list):
            details = [{
                "documentName": "AI Validation",
                "gapAnalysis": aiResult.get("gapAnalysis", ""),
                "successCriteriaMet": criteria if isinstance(criteria, list) else []
            }]

        normalized = {
            "overallSuccess": overall if isinstance(overall, bool) else None,
            "qualityScore": float(quality) if hasNumericQuality else None,
            "validationDetails": details,
            "improvementSuggestions": aiResult.get("improvementSuggestions", []),
            "schemaCompliant": True,
            "originalType": "json",
            "missingFields": []
        }

        missing = [field for field in ("overallSuccess", "qualityScore") if normalized[field] is None]
        normalized["missingFields"] = missing
        normalized["schemaCompliant"] = not missing
        return normalized

    async def _validateWithAI(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
        """Run the AI-based validation: build prompt, call the AI, parse the reply.

        Returns:
            The normalised validation result (see ``_normalizeAiResult``).

        Raises:
            ValueError: When the AI service is unavailable, the reply is empty,
                or the reply contains no JSON object.
            json.JSONDecodeError: When the extracted JSON cannot be parsed.
            Exception: Any error raised by the AI call is logged and re-raised.
        """
        try:
            services = getattr(self, 'services', None)
            if not services or not hasattr(services, 'ai'):
                # This class does not define _createFailedValidationResult (the
                # fallback creator was removed). Use it only when a subclass
                # provides it; otherwise fail explicitly instead of surfacing a
                # misleading AttributeError.
                failedResultFactory = getattr(self, '_createFailedValidationResult', None)
                if callable(failedResultFactory):
                    return failedResultFactory("AI service not available")
                raise ValueError("AI validation failed - AI service not available")

            # Only the leading part of each document is sent to the AI.
            documentContents = [
                {
                    "name": getattr(doc, 'documentName', 'Unknown'),
                    "content": self._extractContent(doc)[:self._PROMPT_CONTENT_LIMIT]
                }
                for doc in documents
            ]
            validationPrompt = self._buildValidationPrompt(documentContents, intent)

            # Call AI service for validation
            from modules.datamodels.datamodelAi import AiCallOptions, OperationType
            request_options = AiCallOptions()
            request_options.operationType = OperationType.GENERAL

            response = await services.ai.callAi(
                prompt=validationPrompt,
                documents=None,
                options=request_options
            )

            # No retries or correction prompts here; parse-or-fail below
            if not response or not response.strip():
                logger.warning("AI validation returned empty response")
                raise ValueError("AI validation failed - empty response")

            result = response.strip()
            logger.debug(f"AI validation response length: {len(result)}")

            jsonText = self._extractJsonText(result)
            if not jsonText:
                logger.debug(f"No JSON found in AI response: {result[:200]}...")
                logger.debug(f"Full AI response: {result}")
                raise ValueError("AI validation failed - no JSON in response")
            logger.debug(f"Extracted JSON from AI response: {jsonText[:200]}...")

            try:
                aiResult = json.loads(jsonText)
            except json.JSONDecodeError as json_error:
                logger.warning(f"AI validation invalid JSON: {str(json_error)}")
                logger.debug(f"JSON content: {jsonText}")
                raise

            logger.info("AI validation JSON parsed successfully")
            return self._normalizeAiResult(aiResult)

        except Exception as e:
            logger.error(f"AI validation failed: {str(e)}")
            raise