From ebb15da91b15a7027639772359c23247a48a74fd Mon Sep 17 00:00:00 2001
From: ValueOn AG <p.motsch@valueon.ch>
Date: Fri, 3 Oct 2025 19:46:10 +0200
Subject: [PATCH] testing flow

---
 modules/interfaces/interfaceAiObjects.py      | 17 +++++-
 modules/services/serviceAi/mainServiceAi.py   | 44 +++++++++++++-
 .../services/serviceExtraction/subPipeline.py | 28 +++++++++
 .../mainServiceGeneration.py                  | 60 ++++++++++++++++++-
 modules/workflows/methods/methodAi.py         | 39 +++++++++++-
 modules/workflows/processing/handlingTasks.py | 27 +++++----
 6 files changed, 195 insertions(+), 20 deletions(-)

diff --git a/modules/interfaces/interfaceAiObjects.py b/modules/interfaces/interfaceAiObjects.py
index 3f1a265a..0df1c71c 100644
--- a/modules/interfaces/interfaceAiObjects.py
+++ b/modules/interfaces/interfaceAiObjects.py
@@ -349,6 +349,17 @@ class AiObjects:
         # Select model for text generation
         modelName = self._selectModel(prompt, context, options)
 
+        # Derive generation parameters
+        temperature = getattr(options, "temperature", None)
+        if temperature is None:
+            temperature = 0.2
+        maxTokens = getattr(options, "maxTokens", None)
+        # Provide a generous default to avoid truncation for long outputs
+        if maxTokens is None:
+            # If resultFormat suggests large outputs (e.g., html, json), allow more tokens
+            wants_large = str(getattr(options, "resultFormat", "")).lower() in ["html", "json", "md", "markdown"]
+            maxTokens = 8000 if wants_large else 2000
+
         messages: List[Dict[str, Any]] = []
         if context:
             messages.append({"role": "system", "content": f"Context from documents:\n{context}"})
@@ -360,11 +371,11 @@ class AiObjects:
         # Call the appropriate function
         if functionName == "callAiBasic":
             if aiModels[modelName]["connector"] == "openai":
-                content = await connector.callAiBasic(messages)
+                content = await connector.callAiBasic(messages, temperature=temperature, maxTokens=maxTokens)
             elif aiModels[modelName]["connector"] == "perplexity":
-                content = await connector.callAiBasic(messages)
+                content = await connector.callAiBasic(messages, temperature=temperature, maxTokens=maxTokens)
             else:
-                response = await connector.callAiBasic(messages)
+                response = await connector.callAiBasic(messages, temperature=temperature, maxTokens=maxTokens)
                 content = response["choices"][0]["message"]["content"]
         elif functionName == "callAiWithWebSearch":
             # Perplexity web search function
diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py
index fce2701b..c0e200d4 100644
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@@ -532,6 +532,17 @@ class AiService:
             if not isinstance(extractionResult, list):
                 return "[Error: No extraction results]"
             
+            # TODO: remove before merging - prepare a timestamped debug dump directory
+            import os
+            from datetime import datetime
+            debug_root = "../local/testing_extraction"
+            ts = datetime.now().strftime("%Y%m%d-%H%M%S")
+            debug_dir = os.path.join(debug_root, f"per_chunk_{ts}")
+            try:
+                os.makedirs(debug_dir, exist_ok=True)
+            except Exception:
+                pass
+
             # Process each chunk with AI
             aiResults: List[str] = []
             
@@ -568,6 +579,15 @@ class AiService:
                             logger.info(f"Chunk size: {len(part.data)} chars")
                             logger.info(f"Chunk preview: {part.data[:200]}...")
                             
+                            # Dump input chunk
+                            try:
+                                idx = len(aiResults) + 1
+                                fpath = os.path.join(debug_dir, f"chunk_{idx:03d}_input.txt")
+                                with open(fpath, "w", encoding="utf-8") as f:
+                                    f.write(str(part.data))
+                            except Exception:
+                                pass
+
                             # Create AI call request for this chunk
                             request = AiCallRequest(
                                 prompt=prompt,
@@ -580,6 +600,14 @@ class AiService:
                             aiResults.append(response.content)
                             
                             logger.info(f"Chunk {len(aiResults)} processed: {len(response.content)} chars response")
+                            # Dump AI response
+                            try:
+                                idx = len(aiResults)
+                                fpath = os.path.join(debug_dir, f"chunk_{idx:03d}_response.txt")
+                                with open(fpath, "w", encoding="utf-8") as f:
+                                    f.write(str(response.content))
+                            except Exception:
+                                pass
                             
                         except Exception as e:
                             logger.warning(f"Error processing text chunk: {str(e)}")
@@ -601,12 +629,24 @@ class AiService:
                 mergeStrategy
             )
             
-            # Extract text from merged content
+            # Extract only AI-generated text from merged content
             resultText = ""
             for part in mergedContent.parts:
-                if part.typeGroup in ("text", "table", "structure") and part.data:
+                if (
+                    part.typeGroup in ("text", "table", "structure")
+                    and part.data
+                    and getattr(part, "metadata", {}).get("aiResult", False)
+                ):
                     resultText += part.data + "\n\n"
             
+            # Dump merged output
+            try:
+                fpath = os.path.join(debug_dir, "merged_output.txt")
+                with open(fpath, "w", encoding="utf-8") as f:
+                    f.write(resultText.strip())
+            except Exception:
+                pass
+
             return resultText.strip()
             
         except Exception as e:
diff --git a/modules/services/serviceExtraction/subPipeline.py b/modules/services/serviceExtraction/subPipeline.py
index 14361966..d74cb974 100644
--- a/modules/services/serviceExtraction/subPipeline.py
+++ b/modules/services/serviceExtraction/subPipeline.py
@@ -1,5 +1,6 @@
 from typing import Any, Dict, List
 import logging
+import os
 
 from modules.datamodels.datamodelExtraction import ExtractedContent, ContentPart
 from .subUtils import makeId
@@ -91,6 +92,33 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
         parts = non_chunk_parts + chunk_parts
         
         logger.debug(f"runExtraction: Final parts after merging: {len(parts)} (chunks: {len(chunk_parts)})")
+    # DEBUG (TODO: remove): dump parts and chunks to files under ../local/testing_extraction/
+    try:
+        base_dir = "../local/testing_extraction"
+        doc_dir = os.path.join(base_dir, f"extraction_{fileName}")
+        os.makedirs(doc_dir, exist_ok=True)
+        # Write a summary file
+        summary_lines: List[str] = [f"fileName: {fileName}", f"mimeType: {mimeType}", f"totalParts: {len(parts)}"]
+        text_index = 0
+        for idx, part in enumerate(parts):
+            is_texty = part.typeGroup in ("text", "table", "structure")
+            size = int(part.metadata.get("size", 0) or 0)
+            is_chunk = bool(part.metadata.get("chunk", False))
+            summary_lines.append(
+                f"part[{idx}]: typeGroup={part.typeGroup}, label={part.label}, size={size}, chunk={is_chunk}"
+            )
+            if is_texty and getattr(part, "data", None):
+                text_index += 1
+                fname = f"part_{idx:03d}_{'chunk' if is_chunk else 'full'}_{text_index:03d}.txt"
+                fpath = os.path.join(doc_dir, fname)
+                with open(fpath, "w", encoding="utf-8") as f:
+                    f.write(f"# typeGroup: {part.typeGroup}\n# label: {part.label}\n# chunk: {is_chunk}\n# size: {size}\n\n")
+                    f.write(str(part.data))
+        with open(os.path.join(doc_dir, "summary.txt"), "w", encoding="utf-8") as f:
+            f.write("\n".join(summary_lines))
+    except Exception as _e:
+        logger.debug(f"Debug dump skipped: {_e}")
+
     return ExtractedContent(id=makeId(), parts=parts)
 
 
diff --git a/modules/services/serviceGeneration/mainServiceGeneration.py b/modules/services/serviceGeneration/mainServiceGeneration.py
index 14bfe7fe..e1168b3c 100644
--- a/modules/services/serviceGeneration/mainServiceGeneration.py
+++ b/modules/services/serviceGeneration/mainServiceGeneration.py
@@ -105,12 +105,49 @@ class GenerationService:
                     
                     logger.info(f"Document {document_name} has content: {len(content)} characters")
                     
+                    # Normalize file extension based on mime type if missing or incorrect
+                    try:
+                        mime_to_ext = {
+                            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
+                            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
+                            "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
+                            "application/pdf": ".pdf",
+                            "text/html": ".html",
+                            "text/markdown": ".md",
+                            "text/plain": ".txt",
+                            "application/json": ".json",
+                        }
+                        expected_ext = mime_to_ext.get(mime_type)
+                        if expected_ext:
+                            if not document_name.lower().endswith(expected_ext):
+                                # Append/replace extension to match mime type
+                                if "." in document_name:
+                                    document_name = document_name.rsplit(".", 1)[0] + expected_ext
+                                else:
+                                    document_name = document_name + expected_ext
+                    except Exception:
+                        pass
+
+                    # Decide if content is base64-encoded binary (e.g., docx/pdf) or plain text
+                    base64encoded = False
+                    try:
+                        binary_mime_types = {
+                            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+                            "application/pdf",
+                        }
+                        if isinstance(document_data, str) and mime_type in binary_mime_types:
+                            base64encoded = True
+                    except Exception:
+                        base64encoded = False
+
                     # Create document with file in one step using interfaces directly
                     document = self._createDocument(
                         fileName=document_name,
                         mimeType=mime_type,
                         content=content,
-                        base64encoded=False,
+                        base64encoded=base64encoded,
                         messageId=message_id
                     )
                     if document:
@@ -272,6 +309,20 @@ class GenerationService:
             tuple: (rendered_content, mime_type)
         """
         try:
+            # DEBUG (TODO: remove): dump renderer input under ../local/testing_extraction to diagnose JSON+HTML mixtures
+            try:
+                import os
+                ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
+                debug_root = "../local/testing_extraction"
+                debug_dir = os.path.join(debug_root, f"render_input_{ts}")
+                os.makedirs(debug_dir, exist_ok=True)
+                with open(os.path.join(debug_dir, "meta.txt"), "w", encoding="utf-8") as f:
+                    f.write(f"title: {title}\nformat: {output_format}\nlength: {len(extracted_content or '')}\nstarts_with_brace: {str(extracted_content.strip().startswith('{') if extracted_content else False)}\n")
+                with open(os.path.join(debug_dir, "extracted_content.txt"), "w", encoding="utf-8") as f:
+                    f.write(extracted_content or "")
+            except Exception:
+                pass
+
             # Get the appropriate renderer for the format
             renderer = self._getFormatRenderer(output_format)
             if not renderer:
@@ -279,6 +330,13 @@ class GenerationService:
             
             # Render the content
             rendered_content, mime_type = await renderer.render(extracted_content, title)
+            # DEBUG (TODO: remove): dump rendered output into debug_dir from the input dump above; silently skipped if that is unavailable
+            try:
+                import os
+                with open(os.path.join(debug_dir, "rendered_output.txt"), "w", encoding="utf-8") as f:
+                    f.write(rendered_content or "")
+            except Exception:
+                pass
             
             logger.info(f"Successfully rendered report to {output_format} format: {len(rendered_content)} characters")
             return rendered_content, mime_type
diff --git a/modules/workflows/methods/methodAi.py b/modules/workflows/methods/methodAi.py
index 0b12f53b..371b1bbc 100644
--- a/modules/workflows/methods/methodAi.py
+++ b/modules/workflows/methods/methodAi.py
@@ -30,12 +30,12 @@ class MethodAi(MethodBase):
     @action
     async def process(self, parameters: Dict[str, Any]) -> ActionResult:
         """
-        Perform an AI call for any type of task with optional document references
+        Perform a generic AI call with optional document references, producing plain text output
         
         Parameters:
             aiPrompt (str): The AI prompt for processing
             documentList (list, optional): List of document references to include in context
-            expectedDocumentFormat (str, optional): Expected document output format with extension, mimeType, description
+            expectedDocumentFormat (str, optional): Preferred output extension (string or dict). Note: This action only returns plain text content.
             processingMode (str, optional): Processing mode - use 'basic', 'advanced', or 'detailed' (defaults to 'basic')
             includeMetadata (bool, optional): Whether to include metadata (default: True)
             operationType (str, optional): Operation type - use 'general', 'generate_plan', 'analyse_content', 'generate_content', 'web_research', 'image_analysis', or 'image_generation'
@@ -146,6 +146,19 @@ If you need to return multiple documents, add more objects to the documents arra
                 options=options
             )
 
+            # DEBUG (TODO: remove): write the raw AI result under ../local/testing_extraction/
+            try:
+                import os
+                from datetime import datetime
+                debug_root = "../local/testing_extraction"
+                ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
+                debug_dir = os.path.join(debug_root, f"method_ai_{ts}")
+                os.makedirs(debug_dir, exist_ok=True)
+                with open(os.path.join(debug_dir, "raw_result.txt"), "w", encoding="utf-8") as f:
+                    f.write(str(result) if result is not None else "")
+            except Exception:
+                pass
+
             # Parse JSON response from AI and create proper ActionDocument objects
             import json
             import re
@@ -225,6 +238,28 @@ If you need to return multiple documents, add more objects to the documents arra
                     mimeType=output_mime_type
                 ))
             
+            # DEBUG (TODO: remove): write parsed documents to files in a method_ai_<timestamp> debug folder
+            try:
+                # The folder name is rebuilt from a fresh timestamp, so it matches the raw-result folder only when both run within the same second
+                import os
+                from datetime import datetime
+                debug_root = "../local/testing_extraction"
+                ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
+                debug_dir = os.path.join(debug_root, f"method_ai_{ts}")
+                os.makedirs(debug_dir, exist_ok=True)
+                # Write a summary and individual documents
+                summary_lines: List[str] = [f"documents: {len(action_documents)}"]
+                for i, doc in enumerate(action_documents, 1):
+                    summary_lines.append(f"doc[{i}]: name={doc.documentName}, mimeType={doc.mimeType}")
+                    safe_name = doc.documentName or f"doc_{i:03d}.txt"
+                    fpath = os.path.join(debug_dir, safe_name)
+                    with open(fpath, "w", encoding="utf-8") as f:
+                        f.write(str(doc.documentData) if doc.documentData is not None else "")
+                with open(os.path.join(debug_dir, "summary.txt"), "w", encoding="utf-8") as f:
+                    f.write("\n".join(summary_lines))
+            except Exception:
+                pass
+
             # Return result in the standard ActionResult format with parsed documents
             return ActionResult.isSuccess(
                 documents=action_documents
diff --git a/modules/workflows/processing/handlingTasks.py b/modules/workflows/processing/handlingTasks.py
index a90ce8d8..431c4bd3 100644
--- a/modules/workflows/processing/handlingTasks.py
+++ b/modules/workflows/processing/handlingTasks.py
@@ -527,7 +527,7 @@ class HandlingTasks:
         user_prompt = extractUserPrompt(context)
         available_documents = extractAvailableDocuments(context)
         user_language = extractUserLanguage(self.services)
-        available_methods = extractAvailableMethods(self.service)
+        available_methods = extractAvailableMethods(self.services)
         
         # Create placeholders dictionary
         placeholders = {
@@ -574,7 +574,7 @@ class HandlingTasks:
         # Extract content for placeholders
         user_prompt = extractUserPrompt(context)
         available_documents = extractAvailableDocuments(context)
-        user_language = extractUserLanguage(self.service)
+        user_language = extractUserLanguage(self.services)
         
         # Get action signature
         method = action.get('method', '')
@@ -1480,16 +1480,19 @@ class HandlingTasks:
                 # Always use the action's execResultLabel for message creation to ensure proper document routing
                 message_result_label = action.execResultLabel
                 
-                # Create message first to get messageId, then create documents with messageId
-                message = await self.createActionMessage(action, result, workflow, message_result_label, [], task_step, task_index, action_index, total_actions)
-                if message:
-                    # Now create documents with the messageId
-                    created_documents = self.services.generation.createDocumentsFromActionResult(result, action, workflow, message.id)
-                    # Update the message with the created documents
-                    if created_documents:
-                        message.documents = created_documents
-                        # Update the message in the database
-                        self.services.interfaceDbChat.updateMessage(message.id, {"documents": [doc.dict() for doc in created_documents]})
+                # Create documents first, then create the message with documents attached in one write
+                created_documents = self.services.generation.createDocumentsFromActionResult(result, action, workflow, None)
+                message = await self.createActionMessage(
+                    action,
+                    result,
+                    workflow,
+                    message_result_label,
+                    created_documents,
+                    task_step,
+                    task_index,
+                    action_index,
+                    total_actions
+                )
                 
                 # Log action results
                 logger.info(f"Action completed successfully")