From 1d347eb15a99d1be3c998fbc044a2f410542463d Mon Sep 17 00:00:00 2001
From: ValueOn AG <p.motsch@valueon.ch>
Date: Wed, 15 Oct 2025 12:41:02 +0200
Subject: [PATCH] adapted ai chat validation

---
 .../mainServiceGeneration.py                  |   8 +-
 .../renderers/rendererHtml.py                 |  18 +++
 .../serviceWorkflow/mainServiceWorkflow.py    | 124 +++++++++++++-----
 .../processing/adaptive/contentValidator.py   |  69 +++++++---
 .../processing/adaptive/progressTracker.py    |  23 +++-
 .../workflows/processing/modes/modeReact.py   |  11 +-
 6 files changed, 195 insertions(+), 58 deletions(-)

diff --git a/modules/services/serviceGeneration/mainServiceGeneration.py b/modules/services/serviceGeneration/mainServiceGeneration.py
index 8ed6423b..d38cea96 100644
--- a/modules/services/serviceGeneration/mainServiceGeneration.py
+++ b/modules/services/serviceGeneration/mainServiceGeneration.py
@@ -323,7 +323,7 @@ class GenerationService:
             try:
                 debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
                 if debug_enabled:
-                    import os
+                    import os, json
                     ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
                     debug_root = "./test-chat/ai"
                     debug_dir = os.path.join(debug_root, f"render_input_{ts}")
@@ -332,6 +332,12 @@ class GenerationService:
                         f.write(f"title: {title}\nformat: {outputFormat}\ncontent_type: {type(extractedContent).__name__}\n")
                         f.write(f"content_size: {len(str(extractedContent))} characters\n")
                         f.write(f"sections_count: {len(extractedContent.get('sections', []))}\n")
+                    # Also write the extracted content JSON for inspection
+                    try:
+                        with open(os.path.join(debug_dir, "extracted_content.json"), "w", encoding="utf-8") as jf:
+                            json.dump(extractedContent, jf, ensure_ascii=False, indent=2)
+                    except Exception:
+                        pass
             except Exception:
                 pass
 
diff --git a/modules/services/serviceGeneration/renderers/rendererHtml.py b/modules/services/serviceGeneration/renderers/rendererHtml.py
index 1dedaf46..1b202886 100644
--- a/modules/services/serviceGeneration/renderers/rendererHtml.py
+++ b/modules/services/serviceGeneration/renderers/rendererHtml.py
@@ -363,6 +363,15 @@ class RendererHtml(BaseRenderer):
     def _render_json_heading(self, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> str:
         """Render a JSON heading to HTML using AI-generated styles."""
         try:
+            # Normalize non-dict inputs
+            if isinstance(heading_data, str):
+                heading_data = {"text": heading_data, "level": 2}
+            elif isinstance(heading_data, list):
+                # A list cannot be a heading: render it as a plain bullet list (no heading element is emitted)
+                return self._render_json_bullet_list({"items": heading_data}, styles)
+            elif not isinstance(heading_data, dict):
+                return ""
+            
             level = heading_data.get("level", 1)
             text = heading_data.get("text", "")
             
@@ -379,6 +388,15 @@ class RendererHtml(BaseRenderer):
     def _render_json_paragraph(self, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> str:
         """Render a JSON paragraph to HTML using AI-generated styles."""
         try:
+            # Normalize non-dict inputs
+            if isinstance(paragraph_data, str):
+                paragraph_data = {"text": paragraph_data}
+            elif isinstance(paragraph_data, list):
+                # A list is rendered as a bullet list instead of a paragraph
+                return self._render_json_bullet_list({"items": paragraph_data}, styles)
+            elif not isinstance(paragraph_data, dict):
+                return ""
+            
             text = paragraph_data.get("text", "")
             
             if text:
diff --git a/modules/services/serviceWorkflow/mainServiceWorkflow.py b/modules/services/serviceWorkflow/mainServiceWorkflow.py
index afc4e3b5..dba44e80 100644
--- a/modules/services/serviceWorkflow/mainServiceWorkflow.py
+++ b/modules/services/serviceWorkflow/mainServiceWorkflow.py
@@ -79,14 +79,7 @@ class WorkflowService:
         """Get ChatDocuments from a list of document references using all three formats."""
         try:
             workflow = self.services.currentWorkflow
-            
-            # Reload workflow from database to ensure we have all messages
-            if hasattr(workflow, 'id'):
-                try:
-                    workflow = self.getWorkflow(workflow.id)
-                    logger.debug(f"Reloaded workflow {workflow.id} with {len(workflow.messages)} messages")
-                except Exception as e:
-                    logger.warning(f"Could not reload workflow from database: {str(e)}")
+            logger.debug(f"getChatDocumentsFromDocumentList: currentWorkflow.id = {workflow.id if workflow and hasattr(workflow, 'id') else 'NO_ID'}")
             
             all_documents = []
             for doc_ref in documentList:
@@ -497,15 +490,32 @@ class WorkflowService:
     def getWorkflow(self, workflowId: str):
         """Get workflow by ID by delegating to the chat interface"""
         try:
-            return self.interfaceDbChat.getWorkflow(workflowId)
+            logger.debug(f"getWorkflow called with workflowId: {workflowId}")
+            result = self.interfaceDbChat.getWorkflow(workflowId)
+            if result:
+                logger.debug(f"getWorkflow returned workflow with ID: {result.id}")
+            else:
+                logger.warning(f"getWorkflow returned None for workflowId: {workflowId}")
+            return result
         except Exception as e:
             logger.error(f"Error getting workflow: {str(e)}")
             raise
 
     def createMessage(self, messageData: Dict[str, Any]):
-        """Create a new message by delegating to the chat interface"""
+        """Create a new message by delegating to the chat interface and append to in-memory workflow."""
         try:
-            return self.interfaceDbChat.createMessage(messageData)
+            message = self.interfaceDbChat.createMessage(messageData)
+            try:
+                # Keep in-memory workflow messages in sync
+                workflow = getattr(self.services, 'currentWorkflow', None)
+                if workflow and hasattr(workflow, 'messages') and message:
+                    # Avoid duplicates if same message was already appended
+                    if not any(getattr(m, 'id', None) == getattr(message, 'id', None) for m in workflow.messages):
+                        workflow.messages.append(message)
+            except Exception:
+                # Never fail if local append has issues
+                pass
+            return message
         except Exception as e:
             logger.error(f"Error creating message: {str(e)}")
             raise
@@ -519,9 +529,24 @@ class WorkflowService:
             raise
 
     def createLog(self, logData: Dict[str, Any]):
-        """Create a new log entry by delegating to the chat interface"""
+        """Create a new log entry by delegating to the chat interface and append to in-memory workflow logs."""
         try:
-            return self.interfaceDbChat.createLog(logData)
+            log_entry = self.interfaceDbChat.createLog(logData)
+            try:
+                workflow = getattr(self.services, 'currentWorkflow', None)
+                if workflow and hasattr(workflow, 'logs') and log_entry:
+                    # Avoid duplicates: match by id if present, else by the (message, publishedAt) pair
+                    get_id = getattr(log_entry, 'id', None)
+                    if get_id is not None:
+                        if not any(getattr(l, 'id', None) == get_id for l in workflow.logs):
+                            workflow.logs.append(log_entry)
+                    else:
+                        key = (getattr(log_entry, 'message', None), getattr(log_entry, 'publishedAt', None))
+                        if not any((getattr(l, 'message', None), getattr(l, 'publishedAt', None)) == key for l in workflow.logs):
+                            workflow.logs.append(log_entry)
+            except Exception:
+                pass
+            return log_entry
         except Exception as e:
             logger.error(f"Error creating log: {str(e)}")
             raise
@@ -611,6 +636,31 @@ class WorkflowService:
             # Get document reference list using the exact same logic as old system
             document_list = self._getDocumentReferenceList(workflow)
             
+            # Optional: dump a concise document index for debugging
+            try:
+                debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
+                if debug_enabled:
+                    import os, json
+                    from datetime import datetime, UTC
+                    ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
+                    debug_root = "./test-chat/ai"
+                    os.makedirs(debug_root, exist_ok=True)
+                    doc_index = []
+                    for bucket in ("chat", "history"):
+                        for ex in document_list.get(bucket, []) or []:
+                            doc_index.append({
+                                "bucket": bucket,
+                                "label": ex.get("documentsLabel"),
+                                "documents": ex.get("documents", [])
+                            })
+                    with open(os.path.join(debug_root, f"{ts}_available_documents_index.json"), "w", encoding="utf-8") as f:
+                        json.dump({
+                            "workflowId": getattr(workflow, 'id', None),
+                            "index": doc_index
+                        }, f, ensure_ascii=False, indent=2)
+            except Exception:
+                pass
+            
             # Build index string for AI action planning
             context = ""
             
@@ -691,47 +741,50 @@ class WorkflowService:
         if all_documents:
             self._refreshDocumentFileAttributes(all_documents)
         
+        def _is_valid_document(doc) -> bool:
+            try:
+                size_ok = getattr(doc, 'fileSize', 0) and getattr(doc, 'fileSize', 0) > 0
+                id_ok = bool(getattr(doc, 'fileId', None))
+                mime_ok = bool(getattr(doc, 'mimeType', None))
+                return size_ok and id_ok and mime_ok
+            except Exception:
+                return False
+        
         chat_exchanges = []
         history_exchanges = []
         
-        # Process messages in reverse order; "first" marks boundary
         in_current_round = True
         for message in reversed(workflow.messages):
             is_first = message.status == "first" if hasattr(message, 'status') else False
             
-            # Build a DocumentExchange if message has documents and an explicit documentsLabel
             doc_exchange = None
             if message.documents:
                 existing_label = getattr(message, 'documentsLabel', None)
                 if existing_label:
-                    # Validate and use the message's actual documentsLabel
                     validated_label = self._validateDocumentLabelConsistency(message)
                     doc_refs = []
                     for doc in message.documents:
+                        if not _is_valid_document(doc):
+                            # Skip empty/invalid docs
+                            continue
                         doc_ref = self._getDocumentReferenceFromChatDocument(doc, message)
                         doc_refs.append(doc_ref)
-                    doc_exchange = {
-                        'documentsLabel': validated_label,
-                        'documents': doc_refs
-                    }
-                # IMPORTANT: Never synthesize new labels here. If a message lacks
-                # a documentsLabel, we skip adding an exchange for it.
+                    if doc_refs:
+                        doc_exchange = {
+                            'documentsLabel': validated_label,
+                            'documents': doc_refs
+                        }
             
-            # Append to appropriate container based on boundary
             if doc_exchange:
                 if in_current_round:
                     chat_exchanges.append(doc_exchange)
                 else:
                     history_exchanges.append(doc_exchange)
             
-            # Flip boundary after including the "first" message in chat
             if in_current_round and is_first:
                 in_current_round = False
         
-        # Sort by recency: most recent first, then current round, then earlier rounds
-        # Sort chat exchanges by message sequence number (most recent first)
         chat_exchanges.sort(key=lambda x: self._getMessageSequenceForExchange(x, workflow), reverse=True)
-        # Sort history exchanges by message sequence number (most recent first)
         history_exchanges.sort(key=lambda x: self._getMessageSequenceForExchange(x, workflow), reverse=True)
         
         return {
@@ -743,11 +796,16 @@ class WorkflowService:
         """Update file attributes (fileName, fileSize, mimeType) for documents"""
         for doc in documents:
             try:
-                # Debug: Log original filename before refresh
                 original_filename = doc.fileName
                 logger.debug(f"Before refresh - Document {doc.id}: fileName='{original_filename}' (length: {len(original_filename)})")
                 
-                # Use the proper WorkflowService method to get file info
+                # Skip invalid docs early if essential identifiers are missing
+                if not getattr(doc, 'fileId', None):
+                    logger.debug(f"Skipping document {doc.id} due to missing fileId")
+                    setattr(doc, 'fileSize', 0)
+                    setattr(doc, 'mimeType', None)
+                    continue
+                
                 file_info = self.getFileInfo(doc.fileId)
                 if file_info:
                     db_filename = file_info.get("fileName", doc.fileName)
@@ -757,10 +815,16 @@ class WorkflowService:
                     doc.fileSize = file_info.get("size", doc.fileSize)
                     doc.mimeType = file_info.get("mimeType", doc.mimeType)
                     
-                    # Debug: Log final filename after refresh
+                    # Mark invalid if missing mimeType
+                    if not doc.mimeType:
+                        logger.debug(f"Document {doc.id} has missing mimeType; will be filtered from index")
+                        setattr(doc, 'fileSize', 0)
+                    
                     logger.debug(f"After refresh - Document {doc.id}: fileName='{doc.fileName}' (length: {len(doc.fileName)})")
                 else:
                     logger.warning(f"File not found for document {doc.id}, fileId: {doc.fileId}")
+                    setattr(doc, 'fileSize', 0)
+                    setattr(doc, 'mimeType', None)
             except Exception as e:
                 logger.error(f"Error refreshing file attributes for document {doc.id}: {e}")
 
diff --git a/modules/workflows/processing/adaptive/contentValidator.py b/modules/workflows/processing/adaptive/contentValidator.py
index 0279df90..5423dc8e 100644
--- a/modules/workflows/processing/adaptive/contentValidator.py
+++ b/modules/workflows/processing/adaptive/contentValidator.py
@@ -38,12 +38,15 @@ class ContentValidator:
             return ""
     
     def _createFailedValidationResult(self, error: str) -> Dict[str, Any]:
-        """Creates a failed validation result"""
+        """Creates a failed validation result in a schema-stable shape"""
         return {
-            "overallSuccess": False,
-            "qualityScore": 0.0,
+            "overallSuccess": None,  # Unknown when validator itself failed
+            "qualityScore": None,
             "validationDetails": [],
-            "improvementSuggestions": [f"NEXT STEP: Fix validation error - {error}. Check system logs for more details and retry the operation."]
+            "improvementSuggestions": [f"NEXT STEP: Fix validation error - {error}. Check system logs for more details and retry the operation."],
+            "schemaCompliant": False,
+            "originalType": "error",
+            "missingFields": ["overallSuccess", "qualityScore"],
         }
     
     def _isValidJsonResponse(self, response: str) -> bool:
@@ -60,7 +63,7 @@ class ContentValidator:
             return False
     
     def _extractFallbackValidationResult(self, response: str) -> Dict[str, Any]:
-        """Extracts validation result from malformed AI response"""
+        """Extracts a minimal validation result from a malformed AI response (schema-stable)"""
         try:
             import re
             
@@ -79,16 +82,23 @@ class ContentValidator:
                 else:
                     overall_success = False
             
-            return {
-                "overallSuccess": overall_success if isinstance(overall_success, bool) else (overall_success.group(1).lower() == 'true' if overall_success else False),
-                "qualityScore": float(quality_score.group(1)) if quality_score else 0.5,
+            parsed_overall = overall_success if isinstance(overall_success, bool) else (overall_success.group(1).lower() == 'true' if overall_success else None)
+            parsed_quality = float(quality_score.group(1)) if quality_score else None
+
+            result = {
+                "overallSuccess": parsed_overall,
+                "qualityScore": parsed_quality,
                 "validationDetails": [{
                     "documentName": "AI Validation (Fallback)",
                     "gapAnalysis": gap_analysis.group(1) if gap_analysis else "Unable to parse detailed analysis",
-                    "successCriteriaMet": [False]  # Conservative fallback
+                    "successCriteriaMet": []
                 }],
-                "improvementSuggestions": ["NEXT STEP: AI response was malformed - retry the operation for better results"]
+                "improvementSuggestions": ["NEXT STEP: AI response was malformed - retry the operation for better results"],
+                "schemaCompliant": False,
+                "originalType": "text",
+                "missingFields": [k for k, v in {"overallSuccess": parsed_overall, "qualityScore": parsed_quality}.items() if v is None],
             }
+            return result
         except Exception as e:
             logger.error(f"Fallback extraction failed: {str(e)}")
             return None
@@ -241,17 +251,38 @@ RESPOND WITH THIS EXACT JSON FORMAT - NO OTHER TEXT:
             try:
                 aiResult = json.loads(result)
                 logger.info("AI validation JSON parsed successfully")
-                
-                return {
-                    "overallSuccess": aiResult.get("overallSuccess", False),
-                    "qualityScore": aiResult.get("qualityScore", 0.0),
-                    "validationDetails": aiResult.get("validationDetails", [{
+
+                overall = aiResult.get("overallSuccess")
+                quality = aiResult.get("qualityScore")
+                details = aiResult.get("validationDetails")
+                gap = aiResult.get("gapAnalysis", "")
+                criteria = aiResult.get("successCriteriaMet")
+                improvements = aiResult.get("improvementSuggestions", [])
+
+                # Normalize into schema-stable object without forcing failure defaults
+                normalized = {
+                    "overallSuccess": overall if isinstance(overall, bool) else None,
+                    "qualityScore": float(quality) if isinstance(quality, (int, float)) else None,
+                    "validationDetails": details if isinstance(details, list) else [{
                         "documentName": "AI Validation",
-                        "gapAnalysis": aiResult.get("gapAnalysis", ""),
-                        "successCriteriaMet": aiResult.get("successCriteriaMet", [False])
-                    }]),
-                    "improvementSuggestions": aiResult.get("improvementSuggestions", [])
+                        "gapAnalysis": gap,
+                        "successCriteriaMet": criteria if isinstance(criteria, list) else []
+                    }],
+                    "improvementSuggestions": improvements,
+                    "schemaCompliant": True,
+                    "originalType": "json",
+                    "missingFields": []
                 }
+
+                if normalized["overallSuccess"] is None:
+                    normalized["missingFields"].append("overallSuccess")
+                if normalized["qualityScore"] is None:
+                    normalized["missingFields"].append("qualityScore")
+                # If any critical field missing, mark as not fully compliant
+                if normalized["missingFields"]:
+                    normalized["schemaCompliant"] = False
+
+                return normalized
                 
             except json.JSONDecodeError as json_error:
                 logger.warning(f"All AI validation attempts failed - invalid JSON: {str(json_error)}")
diff --git a/modules/workflows/processing/adaptive/progressTracker.py b/modules/workflows/processing/adaptive/progressTracker.py
index 80dfcf63..b5a41533 100644
--- a/modules/workflows/processing/adaptive/progressTracker.py
+++ b/modules/workflows/processing/adaptive/progressTracker.py
@@ -20,11 +20,22 @@ class ProgressTracker:
     def updateProgress(self, result: Any, validation: Dict[str, Any], intent: Dict[str, Any]):
         """Updates progress tracking based on action result"""
         try:
-            overallSuccess = validation.get('overallSuccess', False)
-            qualityScore = validation.get('qualityScore', 0)
+            schemaCompliant = validation.get('schemaCompliant', True)
+            overallSuccess = validation.get('overallSuccess', None)
+            qualityScore = validation.get('qualityScore', None)
             improvementSuggestions = validation.get('improvementSuggestions', [])
             
-            if overallSuccess and qualityScore > 0.7:
+            # If validation is not schema compliant or a critical field is missing, treat as indeterminate (do not count as failure)
+            if not schemaCompliant or overallSuccess is None or qualityScore is None:
+                self.partialAchievements.append({
+                    "objective": intent.get('primaryGoal', 'Unknown'),
+                    "partialAchievement": "Validation indeterminate (schema non-compliant or missing fields)",
+                    "missingFields": validation.get('missingFields', []),
+                    "timestamp": datetime.now(timezone.utc).timestamp()
+                })
+                self.currentPhase = "partial"
+                logger.info(f"Indeterminate validation (no penalty): {intent.get('primaryGoal', 'Unknown')}")
+            elif overallSuccess and qualityScore > 0.7:
                 # Successful completion
                 self.completedObjectives.append({
                     "objective": intent.get('primaryGoal', 'Unknown'),
@@ -89,9 +100,13 @@ class ProgressTracker:
                 return False
             
             # If validation shows success, don't continue
-            if validation.get('overallSuccess', False):
+            if validation.get('schemaCompliant', True) and validation.get('overallSuccess', False):
                 return False
             
+            # If validation is not schema compliant, continue so the result can be refined (this branch sets no pass limit)
+            if not validation.get('schemaCompliant', True):
+                return True
+
             # Otherwise, continue
             return True
             
diff --git a/modules/workflows/processing/modes/modeReact.py b/modules/workflows/processing/modes/modeReact.py
index 9a79dcb3..a2091ac0 100644
--- a/modules/workflows/processing/modes/modeReact.py
+++ b/modules/workflows/processing/modes/modeReact.py
@@ -240,12 +240,15 @@ class ReactMode(BaseMode):
                     if ref_match:
                         valid_refs.append(ref_match.group(1))
             
-            # Check if all provided references are valid
+            # Build a set for membership checks; the available_docs index is already filtered to skip empty docs
+            preferred_refs = set(valid_refs)
+            
+            # Check that every provided reference is present in the filtered index
             for ref in document_refs:
-                if ref not in valid_refs:
-                    logger.error(f"Invalid document reference: {ref}")
+                if ref not in preferred_refs:
+                    logger.error(f"Invalid or empty document reference: {ref}")
                     logger.error(f"Available references: {valid_refs}")
-                    raise ValueError(f"Document reference '{ref}' not found in available documents. Use only exact references from AVAILABLE_DOCUMENTS_INDEX.")
+                    raise ValueError(f"Document reference '{ref}' not found or refers to empty document. Use only non-empty references from AVAILABLE_DOCUMENTS_INDEX.")
                     
         except Exception as e:
             logger.error(f"Error validating document references: {str(e)}")