385 lines
No EOL
16 KiB
Python
385 lines
No EOL
16 KiB
Python
# contentValidator.py
|
|
# Content validation for adaptive React mode
|
|
|
|
import re
|
|
import logging
|
|
import json
|
|
from typing import List, Dict, Any
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class ContentValidator:
    """Validates delivered content against user intent.

    Tries an AI-backed validation path first and falls back to rule-based
    checks; see validateContent() for the entry point.
    """

    def __init__(self) -> None:
        # Stateless: everything needed for validation is passed per call.
        pass
|
def validateContent(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""Validates delivered content against user intent using AI"""
|
|
try:
|
|
# First, try AI-based validation for intelligent gap analysis
|
|
aiValidation = self._validateWithAI(documents, intent)
|
|
if aiValidation:
|
|
return aiValidation
|
|
|
|
# Fallback to rule-based validation if AI validation fails
|
|
validationDetails = []
|
|
|
|
for doc in documents:
|
|
content = self._extractContent(doc)
|
|
detail = self._validateSingleDocument(content, doc, intent)
|
|
validationDetails.append(detail)
|
|
|
|
# Calculate overall success
|
|
overallSuccess = all(detail.get("successCriteriaMet", [False]) for detail in validationDetails)
|
|
|
|
# Calculate quality score
|
|
qualityScore = self._calculateQualityScore(validationDetails)
|
|
|
|
# Generate improvement suggestions
|
|
improvementSuggestions = self._generateImprovementSuggestions(validationDetails, intent)
|
|
|
|
return {
|
|
"overallSuccess": overallSuccess,
|
|
"qualityScore": qualityScore,
|
|
"validationDetails": validationDetails,
|
|
"improvementSuggestions": improvementSuggestions
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error validating content: {str(e)}")
|
|
return self._createFailedValidationResult(str(e))
|
|
|
|
def _extractContent(self, doc: Any) -> str:
|
|
"""Extracts content from a document"""
|
|
try:
|
|
if hasattr(doc, 'documentData'):
|
|
data = doc.documentData
|
|
if isinstance(data, dict) and 'content' in data:
|
|
return str(data['content'])
|
|
else:
|
|
return str(data)
|
|
return ""
|
|
except Exception:
|
|
return ""
|
|
|
|
def _validateSingleDocument(self, content: str, doc: Any, intent: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""Validates a single document against intent"""
|
|
# Check data type match
|
|
dataTypeMatch = self._checkDataTypeMatch(content, intent.get("dataType", "unknown"))
|
|
|
|
# Check format match
|
|
formatMatch = self._checkFormatMatch(content, intent.get("expectedFormat", "unknown"))
|
|
|
|
# Calculate quality score
|
|
qualityScore = self._calculateDocumentQualityScore(content, intent)
|
|
|
|
# Check success criteria
|
|
successCriteriaMet = self._checkSuccessCriteria(content, intent)
|
|
|
|
# Identify specific issues
|
|
specificIssues = self._identifySpecificIssues(content, intent)
|
|
|
|
# Generate improvement suggestions
|
|
improvementSuggestions = self._generateDocumentImprovementSuggestions(content, intent)
|
|
|
|
return {
|
|
"documentName": getattr(doc, 'documentName', 'Unknown'),
|
|
"dataTypeMatch": dataTypeMatch,
|
|
"formatMatch": formatMatch,
|
|
"qualityScore": qualityScore,
|
|
"successCriteriaMet": successCriteriaMet,
|
|
"specificIssues": specificIssues,
|
|
"improvementSuggestions": improvementSuggestions
|
|
}
|
|
|
|
def _checkDataTypeMatch(self, content: str, dataType: str) -> bool:
|
|
"""Checks if content matches the expected data type"""
|
|
if dataType == "numbers":
|
|
return self._containsNumbers(content)
|
|
elif dataType == "text":
|
|
return self._containsText(content)
|
|
elif dataType == "documents":
|
|
return self._containsDocumentContent(content)
|
|
elif dataType == "analysis":
|
|
return self._containsAnalysis(content)
|
|
elif dataType == "code":
|
|
return self._containsCode(content)
|
|
else:
|
|
return True # Unknown type, assume match
|
|
|
|
def _containsNumbers(self, content: str) -> bool:
|
|
"""Checks if content contains actual numbers (not code)"""
|
|
# Look for actual numbers in the content
|
|
numbers = re.findall(r'\b\d+\b', content)
|
|
|
|
# Check if it's code (contains function definitions, etc.)
|
|
isCode = any(keyword in content.lower() for keyword in [
|
|
'def ', 'function', 'import ', 'class ', 'for ', 'while ', 'if ',
|
|
'return', 'print(', 'console.log', 'public ', 'private '
|
|
])
|
|
|
|
# If it's code, it doesn't contain actual numbers
|
|
if isCode:
|
|
return False
|
|
|
|
# If it has numbers and it's not code, it contains actual numbers
|
|
return len(numbers) > 0
|
|
|
|
def _containsText(self, content: str) -> bool:
|
|
"""Checks if content contains readable text"""
|
|
# Remove numbers and special characters
|
|
textContent = re.sub(r'[^\w\s]', '', content)
|
|
words = textContent.split()
|
|
|
|
# Check if there are enough words to be considered text
|
|
return len(words) > 5
|
|
|
|
def _containsDocumentContent(self, content: str) -> bool:
|
|
"""Checks if content is suitable for document creation"""
|
|
# Check for structured content
|
|
hasStructure = any(indicator in content for indicator in [
|
|
'\n', '\t', '|', '-', '*', '1.', '2.', '•', '◦'
|
|
])
|
|
|
|
# Check for meaningful content
|
|
hasMeaningfulContent = len(content.strip()) > 50
|
|
|
|
return hasStructure and hasMeaningfulContent
|
|
|
|
def _containsAnalysis(self, content: str) -> bool:
|
|
"""Checks if content contains analysis"""
|
|
analysisIndicators = [
|
|
'analysis', 'findings', 'conclusion', 'summary', 'insights',
|
|
'trends', 'patterns', 'comparison', 'evaluation', 'assessment'
|
|
]
|
|
|
|
contentLower = content.lower()
|
|
return any(indicator in contentLower for indicator in analysisIndicators)
|
|
|
|
def _containsCode(self, content: str) -> bool:
|
|
"""Checks if content contains code"""
|
|
codeIndicators = [
|
|
'def ', 'function', 'import ', 'class ', 'for ', 'while ', 'if ',
|
|
'return', 'print(', 'console.log', 'public ', 'private ', 'void ',
|
|
'int ', 'string ', 'var ', 'let ', 'const '
|
|
]
|
|
|
|
contentLower = content.lower()
|
|
return any(indicator in contentLower for indicator in codeIndicators)
|
|
|
|
def _checkFormatMatch(self, content: str, expectedFormat: str) -> bool:
|
|
"""Checks if content matches expected format"""
|
|
if expectedFormat == "raw_data":
|
|
# Raw data should be simple, not heavily formatted
|
|
return not any(indicator in content for indicator in [
|
|
'<html>', '<div>', '<table>', '## ', '### ', '**', '__'
|
|
])
|
|
elif expectedFormat == "formatted":
|
|
# Formatted content should have structure
|
|
return any(indicator in content for indicator in [
|
|
'\n', '\t', '|', '-', '*', '1.', '2.', '•'
|
|
])
|
|
elif expectedFormat == "structured":
|
|
# Structured content should have clear organization
|
|
return any(indicator in content for indicator in [
|
|
'{', '}', '[', ']', '|', '\t', ' '
|
|
])
|
|
else:
|
|
return True # Unknown format, assume match
|
|
|
|
def _checkSuccessCriteria(self, content: str, intent: Dict[str, Any]) -> List[bool]:
|
|
"""Checks if content meets success criteria"""
|
|
criteriaMet = []
|
|
successCriteria = intent.get("successCriteria", [])
|
|
|
|
for criterion in successCriteria:
|
|
if 'prime numbers' in criterion.lower():
|
|
# Check if content contains actual prime numbers, not code
|
|
hasNumbers = bool(re.search(r'\b\d+\b', content))
|
|
isNotCode = not any(keyword in content.lower() for keyword in [
|
|
'def ', 'function', 'import ', 'class '
|
|
])
|
|
criteriaMet.append(hasNumbers and isNotCode)
|
|
elif 'document' in criterion.lower():
|
|
# Check if content is suitable for document creation
|
|
hasStructure = any(indicator in content for indicator in [
|
|
'\n', '\t', '|', '-', '*', '1.', '2.'
|
|
])
|
|
criteriaMet.append(hasStructure)
|
|
elif 'format' in criterion.lower():
|
|
# Check if content is properly formatted
|
|
hasFormatting = any(indicator in content for indicator in [
|
|
'\n', '\t', '|', '-', '*', '1.', '2.', '•'
|
|
])
|
|
criteriaMet.append(hasFormatting)
|
|
else:
|
|
# Generic check - content should not be empty
|
|
criteriaMet.append(len(content.strip()) > 0)
|
|
|
|
return criteriaMet
|
|
|
|
def _calculateDocumentQualityScore(self, content: str, intent: Dict[str, Any]) -> float:
|
|
"""Calculates quality score for a single document"""
|
|
score = 0.0
|
|
|
|
# Base score for having content
|
|
if len(content.strip()) > 0:
|
|
score += 0.2
|
|
|
|
# Score for data type match
|
|
if self._checkDataTypeMatch(content, intent.get("dataType", "unknown")):
|
|
score += 0.3
|
|
|
|
# Score for format match
|
|
if self._checkFormatMatch(content, intent.get("expectedFormat", "unknown")):
|
|
score += 0.2
|
|
|
|
# Score for success criteria
|
|
successCriteriaMet = self._checkSuccessCriteria(content, intent)
|
|
if successCriteriaMet:
|
|
successRate = sum(successCriteriaMet) / len(successCriteriaMet)
|
|
score += 0.3 * successRate
|
|
|
|
return min(score, 1.0)
|
|
|
|
def _calculateQualityScore(self, validationDetails: List[Dict[str, Any]]) -> float:
|
|
"""Calculates overall quality score from validation details"""
|
|
if not validationDetails:
|
|
return 0.0
|
|
|
|
totalScore = sum(detail.get("qualityScore", 0) for detail in validationDetails)
|
|
return totalScore / len(validationDetails)
|
|
|
|
def _identifySpecificIssues(self, content: str, intent: Dict[str, Any]) -> List[str]:
|
|
"""Identifies specific issues with the content"""
|
|
issues = []
|
|
|
|
# Check for common issues
|
|
if intent.get("dataType") == "numbers" and self._containsCode(content):
|
|
issues.append("Content contains code instead of actual numbers")
|
|
|
|
if intent.get("expectedFormat") == "raw_data" and any(indicator in content for indicator in ['<html>', '## ', '**']):
|
|
issues.append("Content is formatted when raw data was requested")
|
|
|
|
if len(content.strip()) == 0:
|
|
issues.append("Content is empty")
|
|
|
|
return issues
|
|
|
|
def _generateDocumentImprovementSuggestions(self, content: str, intent: Dict[str, Any]) -> List[str]:
|
|
"""Generates improvement suggestions for a single document"""
|
|
suggestions = []
|
|
|
|
dataType = intent.get("dataType", "unknown")
|
|
expectedFormat = intent.get("expectedFormat", "unknown")
|
|
|
|
if dataType == "numbers" and self._containsCode(content):
|
|
suggestions.append("Deliver actual numbers, not code to generate them")
|
|
|
|
if expectedFormat == "raw_data" and any(indicator in content for indicator in ['<html>', '## ']):
|
|
suggestions.append("Provide raw data without formatting")
|
|
|
|
if len(content.strip()) == 0:
|
|
suggestions.append("Provide actual content")
|
|
|
|
return suggestions
|
|
|
|
def _generateImprovementSuggestions(self, validationDetails: List[Dict[str, Any]],
|
|
intent: Dict[str, Any]) -> List[str]:
|
|
"""Generates improvement suggestions based on validation results"""
|
|
suggestions = []
|
|
|
|
# Check for common issues
|
|
if not any(detail.get("dataTypeMatch", False) for detail in validationDetails):
|
|
dataType = intent.get("dataType", "unknown")
|
|
suggestions.append(f"Content should contain {dataType} data, not code or other formats")
|
|
|
|
if not any(detail.get("formatMatch", False) for detail in validationDetails):
|
|
expectedFormat = intent.get("expectedFormat", "unknown")
|
|
suggestions.append(f"Content should be in {expectedFormat} format")
|
|
|
|
# Add specific suggestions from validation details
|
|
for detail in validationDetails:
|
|
suggestions.extend(detail.get("improvementSuggestions", []))
|
|
|
|
return list(set(suggestions)) # Remove duplicates
|
|
|
|
def _createFailedValidationResult(self, error: str) -> Dict[str, Any]:
|
|
"""Creates a failed validation result"""
|
|
return {
|
|
"overallSuccess": False,
|
|
"qualityScore": 0.0,
|
|
"validationDetails": [],
|
|
"improvementSuggestions": [f"Validation failed: {error}"]
|
|
}
|
|
|
|
    def _validateWithAI(self, documents: List[Any], intent: Dict[str, Any]) -> "Dict[str, Any] | None":
        """Ask the project's AI service whether the delivery fulfils the intent.

        Builds a JSON prompt from the (truncated) document contents and the
        user's primary goal, then parses the model's JSON verdict into the
        same result shape as the rule-based path.

        Returns:
            The validation result dict, or None when the AI service is
            unavailable or anything fails — callers then fall back to the
            rule-based validation.
        """
        try:
            # Collect per-document name/content pairs for the prompt.
            documentContents = []
            for doc in documents:
                content = self._extractContent(doc)
                documentContents.append({
                    "name": getattr(doc, 'documentName', 'Unknown'),
                    "content": content[:2000]  # Limit content for AI processing
                })

            # Create AI validation prompt
            validationPrompt = f"""
You are a task completion validator. Analyze if the delivered content actually fulfills the user's request.

USER REQUEST: {intent.get('primaryGoal', 'Unknown')}

DELIVERED CONTENT:
{json.dumps(documentContents, indent=2)}

TASK: Determine if the user's request has been fully completed.

Analyze the gap between what was requested and what was delivered. Consider any missing elements, incorrect formats, incomplete work, or other discrepancies.

Respond with JSON only:
{{
"overallSuccess": true/false,
"qualityScore": 0.0-1.0,
"gapAnalysis": "Detailed analysis of what's missing or incorrect",
"improvementSuggestions": ["specific action 1", "specific action 2"]
}}
"""

            # Call AI service for validation
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
            request_options = AiCallOptions()
            request_options.operationType = OperationType.GENERAL

            request = AiCallRequest(prompt=validationPrompt, context="", options=request_options)

            # Get AI service from the workflow context
            # NOTE(review): `self.services` is never set by this class; it is
            # presumably injected by the owning workflow — confirm, otherwise
            # this path always falls through and returns None.
            if hasattr(self, 'services') and hasattr(self.services, 'ai'):
                response = self.services.ai.aiObjects.call(request)
                if response and response.content:
                    import re
                    result = response.content.strip()
                    # Tolerate prose around the JSON: keep only the outermost
                    # {...} span before parsing.
                    json_match = re.search(r'\{.*\}', result, re.DOTALL)
                    if json_match:
                        result = json_match.group(0)

                    aiResult = json.loads(result)

                    # Normalize the AI verdict into the rule-based result shape.
                    return {
                        "overallSuccess": aiResult.get("overallSuccess", False),
                        "qualityScore": aiResult.get("qualityScore", 0.0),
                        "validationDetails": [{
                            "documentName": "AI Validation",
                            "gapAnalysis": aiResult.get("gapAnalysis", ""),
                            "successCriteriaMet": [aiResult.get("overallSuccess", False)]
                        }],
                        "improvementSuggestions": aiResult.get("improvementSuggestions", [])
                    }

            return None  # Fallback to rule-based validation

        except Exception as e:
            logger.error(f"AI validation failed: {str(e)}")
            return None  # Fallback to rule-based validation