Merge pull request #36 from valueonag/int

report consolidation
2025-09-10 22:53:23 +02:00 · 2025-09-10 22:53:23 +02:00 · 2554f669c7
commit 2554f669c7
parent 2036ccf78b 27107c8a67
1 changed files with 148 additions and 47 deletions
--- a/modules/methods/methodDocument.py
+++ b/modules/methods/methodDocument.py
@ -184,12 +184,14 @@ class MethodDocument(MethodBase):
            expectedDocumentFormats (list): Target formats
            originalDocuments (list, optional): Original names
            includeMetadata (bool, optional): Include metadata (default: True)
            mergeDocuments (bool, optional): Merge all documents into single output (default: False)
        """
        try:
            document_list = parameters.get("documentList", [])
            expected_document_formats = parameters.get("expectedDocumentFormats", [])
            original_documents = parameters.get("originalDocuments", [])
            include_metadata = parameters.get("includeMetadata", True)
            merge_documents = parameters.get("mergeDocuments", False)
            if not document_list:
                return ActionResult.isFailure(
@ -214,9 +216,8 @@ class MethodDocument(MethodBase):
            if not original_documents:
                original_documents = [doc.fileName if hasattr(doc, 'fileName') else str(doc.id) for doc in chat_documents]
-            # Process each document individually with its own format conversion
+            # Extract content from all documents first
-            output_documents = []
+            document_contents = []
            for i, chat_document in enumerate(chat_documents):
                # Extract content from this document directly - NO AI, just read the data as-is
                # This ensures we get the original text content for format conversion
@ -256,52 +257,75 @@ class MethodDocument(MethodBase):
                logger.info(f"Extracted content from document {i+1}: {len(content)} characters")
-                # Get the expected format for this document (or use default)
+                document_contents.append({
-                target_format = None
+                    "document": chat_document,
-                if i < len(expected_document_formats):
+                    "content": content,
-                    target_format = expected_document_formats[i]
+                    "index": i,
-                elif len(expected_document_formats) > 0:
+                    "original_name": original_documents[i] if i < len(original_documents) else f"document_{i+1}"
                    # If fewer formats than documents, use the last format for remaining documents
                    target_format = expected_document_formats[-1]
                if not target_format:
                    logger.warning(f"No expected format for document {i+1}, skipping")
                    continue
                # Use AI to convert format
                formatted_content = await self._convertContentToFormat(content, target_format)
                if not formatted_content:
                    logger.warning(f"Failed to format document {i+1}, skipping")
                    continue
                target_extension = target_format.get("extension", ".txt")
                target_mime_type = target_format.get("mimeType", "text/plain")
                # Create output fileName
                timestamp = int(get_utc_timestamp())
                if i < len(original_documents):
                    base_name = original_documents[i].rsplit('.', 1)[0] if '.' in original_documents[i] else original_documents[i]
                else:
                    base_name = f"document_{i+1}"
                output_fileName = f"{base_name}_generated_{self._format_timestamp_for_filename()}{target_extension}"
                # Create result data
                result_data = {
                    "documentCount": 1,
                    "content": formatted_content,
                    "outputFormat": target_format,
                    "originalDocument": original_documents[i] if i < len(original_documents) else f"document_{i+1}",
                    "timestamp": get_utc_timestamp()
                }
                logger.info(f"Generated document: {output_fileName} with {len(formatted_content)} characters")
                output_documents.append({
                    "documentName": output_fileName,
                    "documentData": result_data,
                    "mimeType": target_mime_type
                })
            if not document_contents:
                return ActionResult.isFailure(
                    error="No valid text content could be extracted from any documents"
                )
            if merge_documents and len(document_contents) > 1:
                # Merge all documents into single output
                logger.info("Merging all documents into single output")
                return await self._mergeDocuments(document_contents, expected_document_formats, include_metadata)
            else:
                # Process each document individually with its own format conversion
                logger.info("Processing documents individually")
                output_documents = []
                for item in document_contents:
                    chat_document = item["document"]
                    content = item["content"]
                    i = item["index"]
                    original_name = item["original_name"]
                    # Get the expected format for this document (or use default)
                    target_format = None
                    if i < len(expected_document_formats):
                        target_format = expected_document_formats[i]
                    elif len(expected_document_formats) > 0:
                        # If fewer formats than documents, use the last format for remaining documents
                        target_format = expected_document_formats[-1]
                    if not target_format:
                        logger.warning(f"No expected format for document {i+1}, skipping")
                        continue
                    # Use AI to convert format
                    formatted_content = await self._convertContentToFormat(content, target_format)
                    if not formatted_content:
                        logger.warning(f"Failed to format document {i+1}, skipping")
                        continue
                    target_extension = target_format.get("extension", ".txt")
                    target_mime_type = target_format.get("mimeType", "text/plain")
                    # Create output fileName
                    base_name = original_name.rsplit('.', 1)[0] if '.' in original_name else original_name
                    output_fileName = f"{base_name}_generated_{self._format_timestamp_for_filename()}{target_extension}"
                    # Create result data
                    result_data = {
                        "documentCount": 1,
                        "content": formatted_content,
                        "outputFormat": target_format,
                        "originalDocument": original_name,
                        "timestamp": get_utc_timestamp()
                    }
                    logger.info(f"Generated document: {output_fileName} with {len(formatted_content)} characters")
                    output_documents.append({
                        "documentName": output_fileName,
                        "documentData": result_data,
                        "mimeType": target_mime_type
                    })
            if not output_documents:
                return ActionResult.isFailure(
                    error="No documents could be generated"
@ -316,6 +340,83 @@ class MethodDocument(MethodBase):
                error=str(e)
            )
    async def _mergeDocuments(
        self,
        document_contents: List[Dict[str, Any]],
        expected_document_formats: List[Dict[str, Any]],
        include_metadata: bool,
    ) -> ActionResult:
        """
        Merge all documents into a single output document.

        Each document with non-blank content contributes one section headed
        by "=== Document: <original_name> ===". The combined text is then
        passed through self._convertContentToFormat.

        Args:
            document_contents (list): One dict per extracted document. Only
                the "content" and "original_name" keys are read here.
            expected_document_formats (list): Target formats. Only the first
                entry is used for the merged output; when the list is empty,
                plain text (".txt" / "text/plain") is used instead.
            include_metadata (bool): Accepted for interface compatibility;
                this method does not currently read it.

        Returns:
            ActionResult: Success carrying exactly one merged document, or a
            failure when no document has usable content or an exception is
            raised while merging.
        """
        try:
            # Build one labelled section per document that actually has text.
            combined_content_parts = []
            original_file_names = []
            for item in document_contents:
                content = item["content"]
                original_name = item["original_name"]

                # Skip missing (None/empty) and whitespace-only content so a
                # single bad document cannot abort the whole merge.
                if not content or not content.strip():
                    continue

                combined_content_parts.append(f"=== Document: {original_name} ===\n{content}\n")
                original_file_names.append(original_name)

            if not combined_content_parts:
                return ActionResult.isFailure(
                    error="No content could be extracted from any documents for merging"
                )

            combined_content = "\n".join(combined_content_parts)
            logger.info(f"Combined content from {len(original_file_names)} documents: {len(combined_content)} characters")

            # The merged output follows the first requested format only.
            target_format = expected_document_formats[0] if expected_document_formats else None
            if not target_format:
                logger.warning("No expected format specified for merged output, using plain text")
                target_format = {"extension": ".txt", "mimeType": "text/plain"}

            # Use AI to convert format; fall back to the unformatted text so
            # the merge still yields a document when conversion returns nothing.
            formatted_content = await self._convertContentToFormat(combined_content, target_format)
            if not formatted_content:
                logger.warning("Failed to format merged content, using raw content")
                formatted_content = combined_content

            target_extension = target_format.get("extension", ".txt")
            target_mime_type = target_format.get("mimeType", "text/plain")

            # Create output fileName for merged document
            timestamp = self._format_timestamp_for_filename()
            output_fileName = f"merged_documents_{timestamp}{target_extension}"

            # documentCount reports the documents that were actually merged,
            # so it always agrees with the originalDocuments list.
            result_data = {
                "documentCount": len(original_file_names),
                "content": formatted_content,
                "outputFormat": target_format,
                "originalDocuments": original_file_names,
                "timestamp": get_utc_timestamp(),
                "merged": True
            }

            logger.info(f"Created merged document: {output_fileName} with {len(formatted_content)} characters")

            return ActionResult.isSuccess(
                documents=[{
                    "documentName": output_fileName,
                    "documentData": result_data,
                    "mimeType": target_mime_type
                }]
            )
        except Exception as e:
            # Boundary handler: report any failure as an ActionResult rather
            # than letting it propagate to the caller.
            logger.error(f"Error merging documents: {str(e)}")
            return ActionResult.isFailure(
                error=f"Failed to merge documents: {str(e)}"
            )
    async def _convertContentToFormat(self, content: str, target_format: Dict[str, Any]) -> str:
        """
        Helper function to convert content to the specified format using AI.