From ebb15da91b15a7027639772359c23247a48a74fd Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Fri, 3 Oct 2025 19:46:10 +0200 Subject: [PATCH] testing flow --- modules/interfaces/interfaceAiObjects.py | 17 +++++- modules/services/serviceAi/mainServiceAi.py | 44 +++++++++++++- .../services/serviceExtraction/subPipeline.py | 28 +++++++++ .../mainServiceGeneration.py | 60 ++++++++++++++++++- modules/workflows/methods/methodAi.py | 39 +++++++++++- modules/workflows/processing/handlingTasks.py | 27 +++++---- 6 files changed, 195 insertions(+), 20 deletions(-) diff --git a/modules/interfaces/interfaceAiObjects.py b/modules/interfaces/interfaceAiObjects.py index 3f1a265a..0df1c71c 100644 --- a/modules/interfaces/interfaceAiObjects.py +++ b/modules/interfaces/interfaceAiObjects.py @@ -349,6 +349,17 @@ class AiObjects: # Select model for text generation modelName = self._selectModel(prompt, context, options) + # Derive generation parameters + temperature = getattr(options, "temperature", None) + if temperature is None: + temperature = 0.2 + maxTokens = getattr(options, "maxTokens", None) + # Provide a generous default to avoid truncation for long outputs + if maxTokens is None: + # If resultFormat suggests large outputs (e.g., html, json), allow more tokens + wants_large = str(getattr(options, "resultFormat", "")).lower() in ["html", "json", "md", "markdown"] + maxTokens = 8000 if wants_large else 2000 + messages: List[Dict[str, Any]] = [] if context: messages.append({"role": "system", "content": f"Context from documents:\n{context}"}) @@ -360,11 +371,11 @@ class AiObjects: # Call the appropriate function if functionName == "callAiBasic": if aiModels[modelName]["connector"] == "openai": - content = await connector.callAiBasic(messages) + content = await connector.callAiBasic(messages, temperature=temperature, maxTokens=maxTokens) elif aiModels[modelName]["connector"] == "perplexity": - content = await connector.callAiBasic(messages) + content = await connector.callAiBasic(messages, temperature=temperature, maxTokens=maxTokens) else: - response = await connector.callAiBasic(messages) + response = await connector.callAiBasic(messages, temperature=temperature, maxTokens=maxTokens) content = response["choices"][0]["message"]["content"] elif functionName == "callAiWithWebSearch": # Perplexity web search function diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py index fce2701b..c0e200d4 100644 --- a/modules/services/serviceAi/mainServiceAi.py +++ b/modules/services/serviceAi/mainServiceAi.py @@ -532,6 +532,17 @@ class AiService: if not isinstance(extractionResult, list): return "[Error: No extraction results]" + # Prepare debug directory TODO TO REMOVE + import os + from datetime import datetime + debug_root = "../local/testing_extraction" + ts = datetime.now().strftime("%Y%m%d-%H%M%S") + debug_dir = os.path.join(debug_root, f"per_chunk_{ts}") + try: + os.makedirs(debug_dir, exist_ok=True) + except Exception: + pass + # Process each chunk with AI aiResults: List[str] = [] @@ -568,6 +579,15 @@ class AiService: logger.info(f"Chunk size: {len(part.data)} chars") logger.info(f"Chunk preview: {part.data[:200]}...") + # Dump input chunk + try: + idx = len(aiResults) + 1 + fpath = os.path.join(debug_dir, f"chunk_{idx:03d}_input.txt") + with open(fpath, "w", encoding="utf-8") as f: + f.write(str(part.data)) + except Exception: + pass + # Create AI call request for this chunk request = AiCallRequest( prompt=prompt, @@ -580,6 +600,14 @@ class AiService: aiResults.append(response.content) logger.info(f"Chunk {len(aiResults)} processed: {len(response.content)} chars response") + # Dump AI response + try: + idx = len(aiResults) + fpath = os.path.join(debug_dir, f"chunk_{idx:03d}_response.txt") + with open(fpath, "w", encoding="utf-8") as f: + f.write(str(response.content)) + except Exception: + pass except Exception as e: logger.warning(f"Error processing text chunk: {str(e)}") @@ -601,12 +629,24 @@ class AiService: mergeStrategy ) - # Extract text from merged content + # Extract only AI-generated text from merged content resultText = "" for part in mergedContent.parts: - if part.typeGroup in ("text", "table", "structure") and part.data: + if ( + part.typeGroup in ("text", "table", "structure") + and part.data + and getattr(part, "metadata", {}).get("aiResult", False) + ): resultText += part.data + "\n\n" + # Dump merged output + try: + fpath = os.path.join(debug_dir, "merged_output.txt") + with open(fpath, "w", encoding="utf-8") as f: + f.write(resultText.strip()) + except Exception: + pass + return resultText.strip() except Exception as e: diff --git a/modules/services/serviceExtraction/subPipeline.py b/modules/services/serviceExtraction/subPipeline.py index 14361966..d74cb974 100644 --- a/modules/services/serviceExtraction/subPipeline.py +++ b/modules/services/serviceExtraction/subPipeline.py @@ -1,5 +1,6 @@ from typing import Any, Dict, List import logging +import os from modules.datamodels.datamodelExtraction import ExtractedContent, ContentPart from .subUtils import makeId @@ -91,6 +92,33 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker parts = non_chunk_parts + chunk_parts logger.debug(f"runExtraction: Final parts after merging: {len(parts)} (chunks: {len(chunk_parts)})") + # DEBUG: dump parts and chunks to files under @testing_extraction/ TODO TO REMOVE + try: + base_dir = "../local/testing_extraction" + doc_dir = os.path.join(base_dir, f"extraction_{fileName}") + os.makedirs(doc_dir, exist_ok=True) + # Write a summary file + summary_lines: List[str] = [f"fileName: {fileName}", f"mimeType: {mimeType}", f"totalParts: {len(parts)}"] + text_index = 0 + for idx, part in enumerate(parts): + is_texty = part.typeGroup in ("text", "table", "structure") + size = int(part.metadata.get("size", 0) or 0) + is_chunk = bool(part.metadata.get("chunk", False)) + summary_lines.append( + f"part[{idx}]: typeGroup={part.typeGroup}, label={part.label}, size={size}, chunk={is_chunk}" + ) + if is_texty and getattr(part, "data", None): + text_index += 1 + fname = f"part_{idx:03d}_{'chunk' if is_chunk else 'full'}_{text_index:03d}.txt" + fpath = os.path.join(doc_dir, fname) + with open(fpath, "w", encoding="utf-8") as f: + f.write(f"# typeGroup: {part.typeGroup}\n# label: {part.label}\n# chunk: {is_chunk}\n# size: {size}\n\n") + f.write(str(part.data)) + with open(os.path.join(doc_dir, "summary.txt"), "w", encoding="utf-8") as f: + f.write("\n".join(summary_lines)) + except Exception as _e: + logger.debug(f"Debug dump skipped: {_e}") + return ExtractedContent(id=makeId(), parts=parts) diff --git a/modules/services/serviceGeneration/mainServiceGeneration.py b/modules/services/serviceGeneration/mainServiceGeneration.py index 14bfe7fe..e1168b3c 100644 --- a/modules/services/serviceGeneration/mainServiceGeneration.py +++ b/modules/services/serviceGeneration/mainServiceGeneration.py @@ -105,12 +105,49 @@ class GenerationService: logger.info(f"Document {document_name} has content: {len(content)} characters") + # Normalize file extension based on mime type if missing or incorrect + try: + mime_to_ext = { + "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx", + "application/pdf": ".pdf", + "text/html": ".html", + "text/markdown": ".md", + "text/plain": ".txt", + "application/json": ".json", + } + expected_ext = mime_to_ext.get(mime_type) + if expected_ext: + if not document_name.lower().endswith(expected_ext): + # Append/replace extension to match mime type + if "." in document_name: + document_name = document_name.rsplit(".", 1)[0] + expected_ext + else: + document_name = document_name + expected_ext + except Exception: + pass + + # Decide if content is base64-encoded binary (e.g., docx/pdf) or plain text + base64encoded = False + try: + binary_mime_types = { + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/pdf", + } + if isinstance(document_data, str) and mime_type in binary_mime_types: + base64encoded = True + except Exception: + base64encoded = False + # Create document with file in one step using interfaces directly document = self._createDocument( fileName=document_name, mimeType=mime_type, content=content, - base64encoded=False, + base64encoded=base64encoded, messageId=message_id ) if document: @@ -272,6 +309,20 @@ class GenerationService: tuple: (rendered_content, mime_type) """ try: + # DEBUG: dump renderer input to @testing_extraction to diagnose JSON+HTML mixtures TODO REMOVE + try: + import os + ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") + debug_root = "../local/testing_extraction" + debug_dir = os.path.join(debug_root, f"render_input_{ts}") + os.makedirs(debug_dir, exist_ok=True) + with open(os.path.join(debug_dir, "meta.txt"), "w", encoding="utf-8") as f: + f.write(f"title: {title}\nformat: {output_format}\nlength: {len(extracted_content or '')}\nstarts_with_brace: {str(extracted_content.strip().startswith('{') if extracted_content else False)}\n") + with open(os.path.join(debug_dir, "extracted_content.txt"), "w", encoding="utf-8") as f: + f.write(extracted_content or "") + except Exception: + pass + # Get the appropriate renderer for the format renderer = self._getFormatRenderer(output_format) if not renderer: @@ -279,6 +330,13 @@ class GenerationService: # Render the content rendered_content, mime_type = await renderer.render(extracted_content, title) + # DEBUG: dump rendered output + try: + import os + with open(os.path.join(debug_dir, "rendered_output.txt"), "w", encoding="utf-8") as f: + f.write(rendered_content or "") + except Exception: + pass logger.info(f"Successfully rendered report to {output_format} format: {len(rendered_content)} characters") return rendered_content, mime_type diff --git a/modules/workflows/methods/methodAi.py b/modules/workflows/methods/methodAi.py index 0b12f53b..371b1bbc 100644 --- a/modules/workflows/methods/methodAi.py +++ b/modules/workflows/methods/methodAi.py @@ -30,12 +30,12 @@ class MethodAi(MethodBase): @action async def process(self, parameters: Dict[str, Any]) -> ActionResult: """ - Perform an AI call for any type of task with optional document references + Perform a generic AI call with optional document references, producing plain text output Parameters: aiPrompt (str): The AI prompt for processing documentList (list, optional): List of document references to include in context - expectedDocumentFormat (str, optional): Expected document output format with extension, mimeType, description + expectedDocumentFormat (str, optional): Preferred output extension (string or dict). Note: This action only returns plain text content. processingMode (str, optional): Processing mode - use 'basic', 'advanced', or 'detailed' (defaults to 'basic') includeMetadata (bool, optional): Whether to include metadata (default: True) operationType (str, optional): Operation type - use 'general', 'generate_plan', 'analyse_content', 'generate_content', 'web_research', 'image_analysis', or 'image_generation' @@ -146,6 +146,19 @@ If you need to return multiple documents, add more objects to the documents arra options=options ) + # DEBUG dump: write raw AI result to @testing_extraction/ TODO Remove + try: + import os + from datetime import datetime + debug_root = "../local/testing_extraction" + ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") + debug_dir = os.path.join(debug_root, f"method_ai_{ts}") + os.makedirs(debug_dir, exist_ok=True) + with open(os.path.join(debug_dir, "raw_result.txt"), "w", encoding="utf-8") as f: + f.write(str(result) if result is not None else "") + except Exception: + pass + # Parse JSON response from AI and create proper ActionDocument objects import json import re @@ -225,6 +238,28 @@ If you need to return multiple documents, add more objects to the documents arra mimeType=output_mime_type )) + # DEBUG dump: write parsed documents to files in the same debug folder + try: + # Reuse the same debug_dir if created above; otherwise create a new one + import os + from datetime import datetime + debug_root = "../local/testing_extraction" + ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") + debug_dir = os.path.join(debug_root, f"method_ai_{ts}") + os.makedirs(debug_dir, exist_ok=True) + # Write a summary and individual documents + summary_lines: List[str] = [f"documents: {len(action_documents)}"] + for i, doc in enumerate(action_documents, 1): + summary_lines.append(f"doc[{i}]: name={doc.documentName}, mimeType={doc.mimeType}") + safe_name = doc.documentName or f"doc_{i:03d}.txt" + fpath = os.path.join(debug_dir, safe_name) + with open(fpath, "w", encoding="utf-8") as f: + f.write(str(doc.documentData) if doc.documentData is not None else "") + with open(os.path.join(debug_dir, "summary.txt"), "w", encoding="utf-8") as f: + f.write("\n".join(summary_lines)) + except Exception: + pass + # Return result in the standard ActionResult format with parsed documents return ActionResult.isSuccess( documents=action_documents diff --git a/modules/workflows/processing/handlingTasks.py b/modules/workflows/processing/handlingTasks.py index a90ce8d8..431c4bd3 100644 --- a/modules/workflows/processing/handlingTasks.py +++ b/modules/workflows/processing/handlingTasks.py @@ -527,7 +527,7 @@ class HandlingTasks: user_prompt = extractUserPrompt(context) available_documents = extractAvailableDocuments(context) user_language = extractUserLanguage(self.services) - available_methods = extractAvailableMethods(self.service) + available_methods = extractAvailableMethods(self.services) # Create placeholders dictionary placeholders = { @@ -574,7 +574,7 @@ class HandlingTasks: # Extract content for placeholders user_prompt = extractUserPrompt(context) available_documents = extractAvailableDocuments(context) - user_language = extractUserLanguage(self.service) + user_language = extractUserLanguage(self.services) # Get action signature method = action.get('method', '') @@ -1480,16 +1480,19 @@ class HandlingTasks: # Always use the action's execResultLabel for message creation to ensure proper document routing message_result_label = action.execResultLabel - # Create message first to get messageId, then create documents with messageId - message = await self.createActionMessage(action, result, workflow, message_result_label, [], task_step, task_index, action_index, total_actions) - if message: - # Now create documents with the messageId - created_documents = self.services.generation.createDocumentsFromActionResult(result, action, workflow, message.id) - # Update the message with the created documents - if created_documents: - message.documents = created_documents - # Update the message in the database - self.services.interfaceDbChat.updateMessage(message.id, {"documents": [doc.dict() for doc in created_documents]}) + # Create documents first, then create the message with documents attached in one write + created_documents = self.services.generation.createDocumentsFromActionResult(result, action, workflow, None) + message = await self.createActionMessage( + action, + result, + workflow, + message_result_label, + created_documents, + task_step, + task_index, + action_index, + total_actions + ) # Log action results logger.info(f"Action completed successfully")