From 27107c8a670d24b1c673c1521a6c407b95332d40 Mon Sep 17 00:00:00 2001
From: ValueOn AG <p.motsch@valueon.ch>
Date: Wed, 10 Sep 2025 22:52:41 +0200
Subject: [PATCH] report consolidation

---
 modules/methods/methodDocument.py | 195 +++++++++++++++++++++++-------
 1 file changed, 148 insertions(+), 47 deletions(-)

diff --git a/modules/methods/methodDocument.py b/modules/methods/methodDocument.py
index d7dae427..89e35a88 100644
--- a/modules/methods/methodDocument.py
+++ b/modules/methods/methodDocument.py
@@ -184,12 +184,14 @@ class MethodDocument(MethodBase):
             expectedDocumentFormats (list): Target formats
             originalDocuments (list, optional): Original names
             includeMetadata (bool, optional): Include metadata (default: True)
+            mergeDocuments (bool, optional): Merge all documents into single output (default: False)
         """
         try:
             document_list = parameters.get("documentList", [])
             expected_document_formats = parameters.get("expectedDocumentFormats", [])
             original_documents = parameters.get("originalDocuments", [])
             include_metadata = parameters.get("includeMetadata", True)
+            merge_documents = parameters.get("mergeDocuments", False)
             
             if not document_list:
                 return ActionResult.isFailure(
@@ -214,9 +216,8 @@ class MethodDocument(MethodBase):
             if not original_documents:
                 original_documents = [doc.fileName if hasattr(doc, 'fileName') else str(doc.id) for doc in chat_documents]
             
-            # Process each document individually with its own format conversion
-            output_documents = []
-            
+            # Extract content from all documents first
+            document_contents = []
             for i, chat_document in enumerate(chat_documents):
                 # Extract content from this document directly - NO AI, just read the data as-is
                 # This ensures we get the original text content for format conversion
@@ -256,52 +257,75 @@ class MethodDocument(MethodBase):
                 
                 logger.info(f"Extracted content from document {i+1}: {len(content)} characters")
                 
-                # Get the expected format for this document (or use default)
-                target_format = None
-                if i < len(expected_document_formats):
-                    target_format = expected_document_formats[i]
-                elif len(expected_document_formats) > 0:
-                    # If fewer formats than documents, use the last format for remaining documents
-                    target_format = expected_document_formats[-1]
-                
-                if not target_format:
-                    logger.warning(f"No expected format for document {i+1}, skipping")
-                    continue
-                
-                # Use AI to convert format
-                formatted_content = await self._convertContentToFormat(content, target_format)
-                if not formatted_content:
-                    logger.warning(f"Failed to format document {i+1}, skipping")
-                    continue
-                
-                target_extension = target_format.get("extension", ".txt")
-                target_mime_type = target_format.get("mimeType", "text/plain")
-                
-                # Create output fileName
-                timestamp = int(get_utc_timestamp())
-                if i < len(original_documents):
-                    base_name = original_documents[i].rsplit('.', 1)[0] if '.' in original_documents[i] else original_documents[i]
-                else:
-                    base_name = f"document_{i+1}"
-                output_fileName = f"{base_name}_generated_{self._format_timestamp_for_filename()}{target_extension}"
-                
-                # Create result data
-                result_data = {
-                    "documentCount": 1,
-                    "content": formatted_content,
-                    "outputFormat": target_format,
-                    "originalDocument": original_documents[i] if i < len(original_documents) else f"document_{i+1}",
-                    "timestamp": get_utc_timestamp()
-                }
-                
-                logger.info(f"Generated document: {output_fileName} with {len(formatted_content)} characters")
-                
-                output_documents.append({
-                    "documentName": output_fileName,
-                    "documentData": result_data,
-                    "mimeType": target_mime_type
+                document_contents.append({
+                    "document": chat_document,
+                    "content": content,
+                    "index": i,
+                    "original_name": original_documents[i] if i < len(original_documents) else f"document_{i+1}"
                 })
             
+            if not document_contents:
+                return ActionResult.isFailure(
+                    error="No valid text content could be extracted from any documents"
+                )
+            
+            if merge_documents and len(document_contents) > 1:
+                # Merge all documents into single output
+                logger.info("Merging all documents into single output")
+                return await self._mergeDocuments(document_contents, expected_document_formats, include_metadata)
+            else:
+                # Process each document individually with its own format conversion
+                logger.info("Processing documents individually")
+                output_documents = []
+            
+                for item in document_contents:
+                    chat_document = item["document"]
+                    content = item["content"]
+                    i = item["index"]
+                    original_name = item["original_name"]
+                    
+                    # Get the expected format for this document (or use default)
+                    target_format = None
+                    if i < len(expected_document_formats):
+                        target_format = expected_document_formats[i]
+                    elif len(expected_document_formats) > 0:
+                        # If fewer formats than documents, use the last format for remaining documents
+                        target_format = expected_document_formats[-1]
+                    
+                    if not target_format:
+                        logger.warning(f"No expected format for document {i+1}, skipping")
+                        continue
+                    
+                    # Use AI to convert format
+                    formatted_content = await self._convertContentToFormat(content, target_format)
+                    if not formatted_content:
+                        logger.warning(f"Failed to format document {i+1}, skipping")
+                        continue
+                    
+                    target_extension = target_format.get("extension", ".txt")
+                    target_mime_type = target_format.get("mimeType", "text/plain")
+                    
+                    # Create output fileName
+                    base_name = original_name.rsplit('.', 1)[0] if '.' in original_name else original_name
+                    output_fileName = f"{base_name}_generated_{self._format_timestamp_for_filename()}{target_extension}"
+                    
+                    # Create result data
+                    result_data = {
+                        "documentCount": 1,
+                        "content": formatted_content,
+                        "outputFormat": target_format,
+                        "originalDocument": original_name,
+                        "timestamp": get_utc_timestamp()
+                    }
+                    
+                    logger.info(f"Generated document: {output_fileName} with {len(formatted_content)} characters")
+                    
+                    output_documents.append({
+                        "documentName": output_fileName,
+                        "documentData": result_data,
+                        "mimeType": target_mime_type
+                    })
+            
             if not output_documents:
                 return ActionResult.isFailure(
                     error="No documents could be generated"
@@ -316,6 +340,83 @@ class MethodDocument(MethodBase):
                 error=str(e)
             )
 
+    async def _mergeDocuments(self, document_contents: List[Dict[str, Any]],
+                            expected_document_formats: List[Dict[str, Any]],
+                            include_metadata: bool) -> ActionResult:
+        """
+        Merge the extracted text of several documents into one output document.
+
+        Blank documents are skipped; the rest are joined under "=== Document: <name> ==="
+        headers and converted with _convertContentToFormat (raw text is kept if that returns nothing).
+
+        Args:
+            document_contents (list): Dicts with "content" and "original_name" keys
+            expected_document_formats (list): Target formats; only the first is used
+            include_metadata (bool): Currently unused by this method
+
+        Returns:
+            ActionResult: Success with one merged document, otherwise failure
+        """
+        try:
+            # Keep only documents that actually carry text; blank ones are skipped
+            combined_content_parts = []
+            original_file_names = []
+            for item in document_contents:
+                content = item["content"]
+                original_name = item["original_name"]
+                if content.strip():
+                    combined_content_parts.append(f"=== Document: {original_name} ===\n{content}\n")
+                    original_file_names.append(original_name)
+
+            if not combined_content_parts:
+                return ActionResult.isFailure(
+                    error="No content could be extracted from any documents for merging"
+                )
+
+            combined_content = "\n".join(combined_content_parts)
+            logger.info(f"Combined content from {len(original_file_names)} documents: {len(combined_content)} characters")
+
+            # The merged output uses the first expected format, if any
+            target_format = expected_document_formats[0] if expected_document_formats else None
+            if not target_format:
+                logger.warning("No expected format specified for merged output, using plain text")
+                target_format = {"extension": ".txt", "mimeType": "text/plain"}
+
+            # Use AI to convert format; fall back to the raw text if it yields nothing
+            formatted_content = await self._convertContentToFormat(combined_content, target_format)
+            if not formatted_content:
+                logger.warning("Failed to format merged content, using raw content")
+                formatted_content = combined_content
+
+            target_extension = target_format.get("extension", ".txt")
+            target_mime_type = target_format.get("mimeType", "text/plain")
+            output_fileName = f"merged_documents_{self._format_timestamp_for_filename()}{target_extension}"
+
+            result_data = {
+                # Count only the documents that were merged so it matches originalDocuments
+                "documentCount": len(original_file_names),
+                "content": formatted_content,
+                "outputFormat": target_format,
+                "originalDocuments": original_file_names,
+                "timestamp": get_utc_timestamp(),
+                "merged": True
+            }
+
+            logger.info(f"Created merged document: {output_fileName} with {len(formatted_content)} characters")
+            return ActionResult.isSuccess(
+                documents=[{
+                    "documentName": output_fileName,
+                    "documentData": result_data,
+                    "mimeType": target_mime_type
+                }]
+            )
+
+        except Exception as e:
+            logger.error(f"Error merging documents: {str(e)}")
+            return ActionResult.isFailure(
+                error=f"Failed to merge documents: {str(e)}"
+            )
+
     async def _convertContentToFormat(self, content: str, target_format: Dict[str, Any]) -> str:
         """
         Helper function to convert content to the specified format using AI.