From a602d13e16790ab2c60e2706661858cd75cc2f22 Mon Sep 17 00:00:00 2001
From: ValueOn AG <p.motsch@valueon.ch>
Date: Tue, 15 Jul 2025 23:58:24 +0200
Subject: [PATCH] MVP2 READY - 95% works: skip empty extraction results, generate HTML report via AI

---
 modules/methods/methodDocument.py | 116 +++++++++++++++++++++++++++++-
 1 file changed, 114 insertions(+), 2 deletions(-)
diff --git a/modules/methods/methodDocument.py b/modules/methods/methodDocument.py
index 585d952c..0146215b 100644
--- a/modules/methods/methodDocument.py
+++ b/modules/methods/methodDocument.py
@@ -149,6 +149,23 @@ class MethodDocument(MethodBase):
                     # Fallback: convert to string representation
                     text_content = str(extracted_content)
                 
+                # Skip empty or whitespace-only content
+                if not text_content or text_content.strip() == "":
+                    logger.info(f"Skipping document {chatDocument.filename} - extraction result is empty or whitespace only")
+                    continue
+                
+                # Skip minimal content that is essentially empty (like "{}", "[]", etc.)
+                stripped_content = text_content.strip()
+                minimal_content_patterns = ['{}', '[]', '""', "''", 'null', 'undefined']
+                if stripped_content in minimal_content_patterns:
+                    logger.info(f"Skipping document {chatDocument.filename} - extraction result is minimal content: '{stripped_content}'")
+                    continue
+                
+                # Skip content that's just whitespace or very short meaningless content
+                if len(stripped_content) <= 2:
+                    logger.info(f"Skipping document {chatDocument.filename} - extraction result is too short: '{stripped_content}' ({len(stripped_content)} chars)")
+                    continue
+                
                 # Create output filename based on original filename
                 original_filename = chatDocument.filename
                 base_name = original_filename.rsplit('.', 1)[0] if '.' in original_filename else original_filename
@@ -172,6 +189,14 @@ class MethodDocument(MethodBase):
                     "mimeType": output_mime_type
                 })
             
+            # Check if we have any valid output documents
+            if not output_documents:
+                return self._createResult(
+                    success=False,
+                    data={},
+                    error="No valid content could be extracted from any documents (all results were empty or whitespace only)"
+                )
+            
             return self._createResult(
                 success=True,
                 data={
@@ -252,7 +277,88 @@ class MethodDocument(MethodBase):
 
     def _generateHtmlReport(self, chatDocuments: List[Any], title: str, includeMetadata: bool) -> str:
         """
-        Generate a simple HTML report from chat documents.
+        Generate a comprehensive HTML report using AI from all input documents.
+
+        Text is read from each document's ``content`` attribute (``data`` as a fallback) and
+        documents without text are skipped. ``includeMetadata`` is only forwarded to the
+        fallback report, returned when the AI call yields nothing or an exception is raised.
+        """
+        try:
+            # Filter out empty documents and collect content
+            validDocuments = []
+            allContent = []
+            for doc in chatDocuments:
+                # Prefer ``content``; use ``data`` when it is missing or empty
+                rawContent = getattr(doc, 'content', None) or getattr(doc, 'data', None)
+                content = rawContent.strip() if rawContent else ""
+                if content:
+                    validDocuments.append(doc)
+                    allContent.append(f"Document: {doc.filename}\n{content}\n")
+
+            if not validDocuments:
+                # If no valid documents, create a simple report
+                html = ["<html><head><meta charset='utf-8'><title>" + title + "</title></head><body>"]
+                html.append(f"<h1>{title}</h1>")
+                html.append(f"<p><b>Generated:</b> {datetime.now(UTC).strftime('%Y-%m-%d %H:%M:%S UTC')}</p>")
+                html.append("<p><em>No content available in the provided documents.</em></p>")
+                html.append("</body></html>")
+                return '\n'.join(html)
+
+            # Create AI prompt for comprehensive report generation
+            combinedContent = "\n\n".join(allContent)
+            aiPrompt = f"""
+            Create a comprehensive, well-structured HTML report based on the following documents and content.
+            
+            Report Title: {title}
+            
+            Requirements:
+            1. Create a professional, well-formatted HTML report
+            2. Include an executive summary at the beginning
+            3. Organize information logically with clear sections
+            4. Highlight key findings and insights
+            5. Include relevant data, statistics, and conclusions
+            6. Use proper HTML formatting with headers, lists, and styling
+            7. Make it readable and professional
+            
+            Document Content:
+            {combinedContent}
+            
+            Generate a complete HTML report that integrates all the information into a cohesive, professional document.
+            """
+
+            # Call AI to generate the report
+            # NOTE(review): invoked without await - confirm callAiTextBasic is synchronous
+            logger.info(f"Generating AI report for {len(validDocuments)} documents")
+            aiReport = self.service.callAiTextBasic(aiPrompt, combinedContent)
+
+            # If AI call fails, fall back to basic HTML
+            if not aiReport or aiReport.strip() == "":
+                logger.warning("AI report generation failed, using fallback HTML")
+                return self._generateFallbackHtmlReport(validDocuments, title, includeMetadata)
+
+            # A full document may open with a doctype or upper-case tags; never wrap it again
+            reportStart = aiReport.lstrip()[:9].lower()
+            if reportStart.startswith(('<!doctype', '<html')):
+                # AI returned complete HTML, use it directly
+                return aiReport
+
+            # Wrap the AI content in proper HTML structure
+            html = ["<html><head><meta charset='utf-8'><title>" + title + "</title></head><body>"]
+            html.append(f"<h1>{title}</h1>")
+            html.append(f"<p><b>Generated:</b> {datetime.now(UTC).strftime('%Y-%m-%d %H:%M:%S UTC')}</p>")
+            html.append(f"<p><b>Total Documents Analyzed:</b> {len(validDocuments)}</p>")
+            html.append("<hr>")
+            html.append(aiReport)
+            html.append("</body></html>")
+            return '\n'.join(html)
+        except Exception as e:
+            logger.error(f"Error generating AI report: {str(e)}")
+            # Fall back to basic HTML report
+            return self._generateFallbackHtmlReport(chatDocuments, title, includeMetadata)
+    
+    def _generateFallbackHtmlReport(self, chatDocuments: List[Any], title: str, includeMetadata: bool) -> str:
+        """
+        Generate a basic HTML report as fallback when AI generation fails.
         """
         html = ["<html><head><meta charset='utf-8'><title>" + title + "</title></head><body>"]
         html.append(f"<h1>{title}</h1>")
@@ -272,8 +378,14 @@ class MethodDocument(MethodBase):
                 html.append("</ul>")
             
             # Add document content if available
+            content = ""
             if hasattr(doc, 'content') and doc.content:
-                html.append(f"<div style='white-space:pre-wrap; border:1px solid #ccc; padding:0.5em; margin-bottom:1em; background-color:#f9f9f9;'>{doc.content}</div>")
+                content = doc.content
+            elif hasattr(doc, 'data') and doc.data:
+                content = doc.data
+            
+            if content:
+                html.append(f"<div style='white-space:pre-wrap; border:1px solid #ccc; padding:0.5em; margin-bottom:1em; background-color:#f9f9f9;'>{content}</div>")
             else:
                 html.append("<p><em>No content available</em></p>")