diff --git a/modules/methods/methodDocument.py b/modules/methods/methodDocument.py index 585d952c..0146215b 100644 --- a/modules/methods/methodDocument.py +++ b/modules/methods/methodDocument.py @@ -149,6 +149,23 @@ class MethodDocument(MethodBase): # Fallback: convert to string representation text_content = str(extracted_content) + # Skip empty or whitespace-only content + if not text_content or text_content.strip() == "": + logger.info(f"Skipping document {chatDocument.filename} - extraction result is empty or whitespace only") + continue + + # Skip minimal content that is essentially empty (like "{}", "[]", etc.) + stripped_content = text_content.strip() + minimal_content_patterns = ['{}', '[]', '""', "''", 'null', 'undefined'] + if stripped_content in minimal_content_patterns: + logger.info(f"Skipping document {chatDocument.filename} - extraction result is minimal content: '{stripped_content}'") + continue + + # Skip content that's just whitespace or very short meaningless content + if len(stripped_content) <= 2: + logger.info(f"Skipping document {chatDocument.filename} - extraction result is too short: '{stripped_content}' ({len(stripped_content)} chars)") + continue + # Create output filename based on original filename original_filename = chatDocument.filename base_name = original_filename.rsplit('.', 1)[0] if '.' in original_filename else original_filename @@ -172,6 +189,14 @@ class MethodDocument(MethodBase): "mimeType": output_mime_type }) + # Check if we have any valid output documents + if not output_documents: + return self._createResult( + success=False, + data={}, + error="No valid content could be extracted from any documents (all results were empty or whitespace only)" + ) + return self._createResult( success=True, data={ @@ -252,7 +277,88 @@ class MethodDocument(MethodBase): def _generateHtmlReport(self, chatDocuments: List[Any], title: str, includeMetadata: bool) -> str: """ - Generate a simple HTML report from chat documents. + Generate a comprehensive HTML report using AI from all input documents. + """ + try: + # Filter out empty documents and collect content + validDocuments = [] + allContent = [] + + for doc in chatDocuments: + content = "" + if hasattr(doc, 'content') and doc.content: + content = doc.content.strip() + elif hasattr(doc, 'data') and doc.data: + content = doc.data.strip() + + # Skip empty documents + if content: + validDocuments.append(doc) + allContent.append(f"Document: {doc.filename}\n{content}\n") + + if not validDocuments: + # If no valid documents, create a simple report + html = ["
Generated: {datetime.now(UTC).strftime('%Y-%m-%d %H:%M:%S UTC')}
") + html.append("No content available in the provided documents.
") + html.append("") + return '\n'.join(html) + + # Create AI prompt for comprehensive report generation + combinedContent = "\n\n".join(allContent) + aiPrompt = f""" + Create a comprehensive, well-structured HTML report based on the following documents and content. + + Report Title: {title} + + Requirements: + 1. Create a professional, well-formatted HTML report + 2. Include an executive summary at the beginning + 3. Organize information logically with clear sections + 4. Highlight key findings and insights + 5. Include relevant data, statistics, and conclusions + 6. Use proper HTML formatting with headers, lists, and styling + 7. Make it readable and professional + + Document Content: + {combinedContent} + + Generate a complete HTML report that integrates all the information into a cohesive, professional document. + """ + + # Call AI to generate the report + logger.info(f"Generating AI report for {len(validDocuments)} documents") + aiReport = self.service.callAiTextBasic(aiPrompt, combinedContent) + + # If AI call fails, fall back to basic HTML + if not aiReport or aiReport.strip() == "": + logger.warning("AI report generation failed, using fallback HTML") + return self._generateFallbackHtmlReport(validDocuments, title, includeMetadata) + + # Clean up the AI response and ensure it's valid HTML + if not aiReport.strip().startswith('Generated: {datetime.now(UTC).strftime('%Y-%m-%d %H:%M:%S UTC')}
") + html.append(f"Total Documents Analyzed: {len(validDocuments)}
") + html.append("No content available
")