From 27107c8a670d24b1c673c1521a6c407b95332d40 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Wed, 10 Sep 2025 22:52:41 +0200
Subject: [PATCH] report consolidation
---
modules/methods/methodDocument.py | 195 +++++++++++++++++++++++-------
1 file changed, 148 insertions(+), 47 deletions(-)
diff --git a/modules/methods/methodDocument.py b/modules/methods/methodDocument.py
index d7dae427..89e35a88 100644
--- a/modules/methods/methodDocument.py
+++ b/modules/methods/methodDocument.py
@@ -184,12 +184,14 @@ class MethodDocument(MethodBase):
expectedDocumentFormats (list): Target formats
originalDocuments (list, optional): Original names
includeMetadata (bool, optional): Include metadata (default: True)
+ mergeDocuments (bool, optional): Merge all documents into single output (default: False)
"""
try:
document_list = parameters.get("documentList", [])
expected_document_formats = parameters.get("expectedDocumentFormats", [])
original_documents = parameters.get("originalDocuments", [])
include_metadata = parameters.get("includeMetadata", True)
+ merge_documents = parameters.get("mergeDocuments", False)
if not document_list:
return ActionResult.isFailure(
@@ -214,9 +216,8 @@ class MethodDocument(MethodBase):
if not original_documents:
original_documents = [doc.fileName if hasattr(doc, 'fileName') else str(doc.id) for doc in chat_documents]
- # Process each document individually with its own format conversion
- output_documents = []
-
+ # Extract content from all documents first
+ document_contents = []
for i, chat_document in enumerate(chat_documents):
# Extract content from this document directly - NO AI, just read the data as-is
# This ensures we get the original text content for format conversion
@@ -256,52 +257,75 @@ class MethodDocument(MethodBase):
logger.info(f"Extracted content from document {i+1}: {len(content)} characters")
- # Get the expected format for this document (or use default)
- target_format = None
- if i < len(expected_document_formats):
- target_format = expected_document_formats[i]
- elif len(expected_document_formats) > 0:
- # If fewer formats than documents, use the last format for remaining documents
- target_format = expected_document_formats[-1]
-
- if not target_format:
- logger.warning(f"No expected format for document {i+1}, skipping")
- continue
-
- # Use AI to convert format
- formatted_content = await self._convertContentToFormat(content, target_format)
- if not formatted_content:
- logger.warning(f"Failed to format document {i+1}, skipping")
- continue
-
- target_extension = target_format.get("extension", ".txt")
- target_mime_type = target_format.get("mimeType", "text/plain")
-
- # Create output fileName
- timestamp = int(get_utc_timestamp())
- if i < len(original_documents):
- base_name = original_documents[i].rsplit('.', 1)[0] if '.' in original_documents[i] else original_documents[i]
- else:
- base_name = f"document_{i+1}"
- output_fileName = f"{base_name}_generated_{self._format_timestamp_for_filename()}{target_extension}"
-
- # Create result data
- result_data = {
- "documentCount": 1,
- "content": formatted_content,
- "outputFormat": target_format,
- "originalDocument": original_documents[i] if i < len(original_documents) else f"document_{i+1}",
- "timestamp": get_utc_timestamp()
- }
-
- logger.info(f"Generated document: {output_fileName} with {len(formatted_content)} characters")
-
- output_documents.append({
- "documentName": output_fileName,
- "documentData": result_data,
- "mimeType": target_mime_type
+ document_contents.append({
+ "document": chat_document,
+ "content": content,
+ "index": i,
+ "original_name": original_documents[i] if i < len(original_documents) else f"document_{i+1}"
})
+ if not document_contents:
+ return ActionResult.isFailure(
+ error="No valid text content could be extracted from any documents"
+ )
+
+ if merge_documents and len(document_contents) > 1:
+ # Merge all documents into single output
+ logger.info("Merging all documents into single output")
+ return await self._mergeDocuments(document_contents, expected_document_formats, include_metadata)
+ else:
+ # Process each document individually with its own format conversion
+ logger.info("Processing documents individually")
+ output_documents = []
+
+ for item in document_contents:
+ chat_document = item["document"]
+ content = item["content"]
+ i = item["index"]
+ original_name = item["original_name"]
+
+ # Get the expected format for this document (or use default)
+ target_format = None
+ if i < len(expected_document_formats):
+ target_format = expected_document_formats[i]
+ elif len(expected_document_formats) > 0:
+ # If fewer formats than documents, use the last format for remaining documents
+ target_format = expected_document_formats[-1]
+
+ if not target_format:
+ logger.warning(f"No expected format for document {i+1}, skipping")
+ continue
+
+ # Use AI to convert format
+ formatted_content = await self._convertContentToFormat(content, target_format)
+ if not formatted_content:
+ logger.warning(f"Failed to format document {i+1}, skipping")
+ continue
+
+ target_extension = target_format.get("extension", ".txt")
+ target_mime_type = target_format.get("mimeType", "text/plain")
+
+ # Create output fileName
+ base_name = original_name.rsplit('.', 1)[0] if '.' in original_name else original_name
+ output_fileName = f"{base_name}_generated_{self._format_timestamp_for_filename()}{target_extension}"
+
+ # Create result data
+ result_data = {
+ "documentCount": 1,
+ "content": formatted_content,
+ "outputFormat": target_format,
+ "originalDocument": original_name,
+ "timestamp": get_utc_timestamp()
+ }
+
+ logger.info(f"Generated document: {output_fileName} with {len(formatted_content)} characters")
+
+ output_documents.append({
+ "documentName": output_fileName,
+ "documentData": result_data,
+ "mimeType": target_mime_type
+ })
+
if not output_documents:
return ActionResult.isFailure(
error="No documents could be generated"
@@ -316,6 +340,83 @@ class MethodDocument(MethodBase):
error=str(e)
)
+ async def _mergeDocuments(self, document_contents: List[Dict[str, Any]],
+                           expected_document_formats: List[Dict[str, Any]],
+                           include_metadata: bool) -> ActionResult:
+     """
+     Merge the text of all given documents into a single output document.
+
+     Args:
+         document_contents: Items carrying "content" and "original_name" keys.
+         expected_document_formats: Target formats; only the first entry is used,
+             falling back to plain text (.txt, text/plain) when the list is empty.
+         include_metadata: Accepted from the caller but not read by this method.
+
+     Returns:
+         ActionResult: success with exactly one merged document, or a failure when
+         no document has non-blank content or an exception is raised.
+     """
+     try:
+         # Blank documents are left out of both the merged text and the name list
+         merged_items = [item for item in document_contents if item["content"].strip()]
+         if not merged_items:
+             return ActionResult.isFailure(
+                 error="No content could be extracted from any documents for merging"
+             )
+
+         original_file_names = [item["original_name"] for item in merged_items]
+         # Each part is prefixed with a header line naming its source document
+         combined_content = "\n".join(
+             f"=== Document: {item['original_name']} ===\n{item['content']}\n"
+             for item in merged_items
+         )
+         logger.info(f"Combined content from {len(original_file_names)} documents: {len(combined_content)} characters")
+
+         # The merged output uses the first requested format, if any
+         target_format = expected_document_formats[0] if expected_document_formats else None
+         if not target_format:
+             logger.warning("No expected format specified for merged output, using plain text")
+             target_format = {"extension": ".txt", "mimeType": "text/plain"}
+
+         # Use AI to convert format; an empty result falls back to the unformatted text
+         formatted_content = await self._convertContentToFormat(combined_content, target_format)
+         if not formatted_content:
+             logger.warning("Failed to format merged content, using raw content")
+             formatted_content = combined_content
+
+         target_extension = target_format.get("extension", ".txt")
+         target_mime_type = target_format.get("mimeType", "text/plain")
+
+         # Create output fileName for merged document
+         timestamp = self._format_timestamp_for_filename()
+         output_fileName = f"merged_documents_{timestamp}{target_extension}"
+
+         # Count only the documents actually merged so it matches originalDocuments
+         result_data = {
+             "documentCount": len(original_file_names),
+             "content": formatted_content,
+             "outputFormat": target_format,
+             "originalDocuments": original_file_names,
+             "timestamp": get_utc_timestamp(),
+             "merged": True
+         }
+
+         logger.info(f"Created merged document: {output_fileName} with {len(formatted_content)} characters")
+
+         return ActionResult.isSuccess(
+             documents=[{
+                 "documentName": output_fileName,
+                 "documentData": result_data,
+                 "mimeType": target_mime_type
+             }]
+         )
+
+     except Exception as e:
+         logger.error(f"Error merging documents: {str(e)}")
+         return ActionResult.isFailure(
+             error=f"Failed to merge documents: {str(e)}"
+         )
+
async def _convertContentToFormat(self, content: str, target_format: Dict[str, Any]) -> str:
"""
Helper function to convert content to the specified format using AI.