testing flow

This commit is contained in:
ValueOn AG 2025-10-03 19:46:10 +02:00
parent 418bd8281a
commit ebb15da91b
6 changed files with 195 additions and 20 deletions

View file

@@ -349,6 +349,17 @@ class AiObjects:
# Select model for text generation # Select model for text generation
modelName = self._selectModel(prompt, context, options) modelName = self._selectModel(prompt, context, options)
# Derive generation parameters
temperature = getattr(options, "temperature", None)
if temperature is None:
temperature = 0.2
maxTokens = getattr(options, "maxTokens", None)
# Provide a generous default to avoid truncation for long outputs
if maxTokens is None:
# If resultFormat suggests large outputs (e.g., html, json), allow more tokens
wants_large = str(getattr(options, "resultFormat", "")).lower() in ["html", "json", "md", "markdown"]
maxTokens = 8000 if wants_large else 2000
messages: List[Dict[str, Any]] = [] messages: List[Dict[str, Any]] = []
if context: if context:
messages.append({"role": "system", "content": f"Context from documents:\n{context}"}) messages.append({"role": "system", "content": f"Context from documents:\n{context}"})
@@ -360,11 +371,11 @@ class AiObjects:
# Call the appropriate function # Call the appropriate function
if functionName == "callAiBasic": if functionName == "callAiBasic":
if aiModels[modelName]["connector"] == "openai": if aiModels[modelName]["connector"] == "openai":
content = await connector.callAiBasic(messages) content = await connector.callAiBasic(messages, temperature=temperature, maxTokens=maxTokens)
elif aiModels[modelName]["connector"] == "perplexity": elif aiModels[modelName]["connector"] == "perplexity":
content = await connector.callAiBasic(messages) content = await connector.callAiBasic(messages, temperature=temperature, maxTokens=maxTokens)
else: else:
response = await connector.callAiBasic(messages) response = await connector.callAiBasic(messages, temperature=temperature, maxTokens=maxTokens)
content = response["choices"][0]["message"]["content"] content = response["choices"][0]["message"]["content"]
elif functionName == "callAiWithWebSearch": elif functionName == "callAiWithWebSearch":
# Perplexity web search function # Perplexity web search function

View file

@@ -532,6 +532,17 @@ class AiService:
if not isinstance(extractionResult, list): if not isinstance(extractionResult, list):
return "[Error: No extraction results]" return "[Error: No extraction results]"
# Prepare debug directory TODO TO REMOVE
import os
from datetime import datetime
debug_root = "../local/testing_extraction"
ts = datetime.now().strftime("%Y%m%d-%H%M%S")
debug_dir = os.path.join(debug_root, f"per_chunk_{ts}")
try:
os.makedirs(debug_dir, exist_ok=True)
except Exception:
pass
# Process each chunk with AI # Process each chunk with AI
aiResults: List[str] = [] aiResults: List[str] = []
@@ -568,6 +579,15 @@ class AiService:
logger.info(f"Chunk size: {len(part.data)} chars") logger.info(f"Chunk size: {len(part.data)} chars")
logger.info(f"Chunk preview: {part.data[:200]}...") logger.info(f"Chunk preview: {part.data[:200]}...")
# Dump input chunk
try:
idx = len(aiResults) + 1
fpath = os.path.join(debug_dir, f"chunk_{idx:03d}_input.txt")
with open(fpath, "w", encoding="utf-8") as f:
f.write(str(part.data))
except Exception:
pass
# Create AI call request for this chunk # Create AI call request for this chunk
request = AiCallRequest( request = AiCallRequest(
prompt=prompt, prompt=prompt,
@@ -580,6 +600,14 @@ class AiService:
aiResults.append(response.content) aiResults.append(response.content)
logger.info(f"Chunk {len(aiResults)} processed: {len(response.content)} chars response") logger.info(f"Chunk {len(aiResults)} processed: {len(response.content)} chars response")
# Dump AI response
try:
idx = len(aiResults)
fpath = os.path.join(debug_dir, f"chunk_{idx:03d}_response.txt")
with open(fpath, "w", encoding="utf-8") as f:
f.write(str(response.content))
except Exception:
pass
except Exception as e: except Exception as e:
logger.warning(f"Error processing text chunk: {str(e)}") logger.warning(f"Error processing text chunk: {str(e)}")
@@ -601,12 +629,24 @@ class AiService:
mergeStrategy mergeStrategy
) )
# Extract text from merged content # Extract only AI-generated text from merged content
resultText = "" resultText = ""
for part in mergedContent.parts: for part in mergedContent.parts:
if part.typeGroup in ("text", "table", "structure") and part.data: if (
part.typeGroup in ("text", "table", "structure")
and part.data
and getattr(part, "metadata", {}).get("aiResult", False)
):
resultText += part.data + "\n\n" resultText += part.data + "\n\n"
# Dump merged output
try:
fpath = os.path.join(debug_dir, "merged_output.txt")
with open(fpath, "w", encoding="utf-8") as f:
f.write(resultText.strip())
except Exception:
pass
return resultText.strip() return resultText.strip()
except Exception as e: except Exception as e:

View file

@@ -1,5 +1,6 @@
from typing import Any, Dict, List from typing import Any, Dict, List
import logging import logging
import os
from modules.datamodels.datamodelExtraction import ExtractedContent, ContentPart from modules.datamodels.datamodelExtraction import ExtractedContent, ContentPart
from .subUtils import makeId from .subUtils import makeId
@@ -91,6 +92,33 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
parts = non_chunk_parts + chunk_parts parts = non_chunk_parts + chunk_parts
logger.debug(f"runExtraction: Final parts after merging: {len(parts)} (chunks: {len(chunk_parts)})") logger.debug(f"runExtraction: Final parts after merging: {len(parts)} (chunks: {len(chunk_parts)})")
# DEBUG: dump parts and chunks to files under @testing_extraction/ TODO TO REMOVE
try:
base_dir = "../local/testing_extraction"
doc_dir = os.path.join(base_dir, f"extraction_{fileName}")
os.makedirs(doc_dir, exist_ok=True)
# Write a summary file
summary_lines: List[str] = [f"fileName: {fileName}", f"mimeType: {mimeType}", f"totalParts: {len(parts)}"]
text_index = 0
for idx, part in enumerate(parts):
is_texty = part.typeGroup in ("text", "table", "structure")
size = int(part.metadata.get("size", 0) or 0)
is_chunk = bool(part.metadata.get("chunk", False))
summary_lines.append(
f"part[{idx}]: typeGroup={part.typeGroup}, label={part.label}, size={size}, chunk={is_chunk}"
)
if is_texty and getattr(part, "data", None):
text_index += 1
fname = f"part_{idx:03d}_{'chunk' if is_chunk else 'full'}_{text_index:03d}.txt"
fpath = os.path.join(doc_dir, fname)
with open(fpath, "w", encoding="utf-8") as f:
f.write(f"# typeGroup: {part.typeGroup}\n# label: {part.label}\n# chunk: {is_chunk}\n# size: {size}\n\n")
f.write(str(part.data))
with open(os.path.join(doc_dir, "summary.txt"), "w", encoding="utf-8") as f:
f.write("\n".join(summary_lines))
except Exception as _e:
logger.debug(f"Debug dump skipped: {_e}")
return ExtractedContent(id=makeId(), parts=parts) return ExtractedContent(id=makeId(), parts=parts)

View file

@@ -105,12 +105,49 @@ class GenerationService:
logger.info(f"Document {document_name} has content: {len(content)} characters") logger.info(f"Document {document_name} has content: {len(content)} characters")
# Normalize file extension based on mime type if missing or incorrect
try:
mime_to_ext = {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
"application/pdf": ".pdf",
"text/html": ".html",
"text/markdown": ".md",
"text/plain": ".txt",
"application/json": ".json",
}
expected_ext = mime_to_ext.get(mime_type)
if expected_ext:
if not document_name.lower().endswith(expected_ext):
# Append/replace extension to match mime type
if "." in document_name:
document_name = document_name.rsplit(".", 1)[0] + expected_ext
else:
document_name = document_name + expected_ext
except Exception:
pass
# Decide if content is base64-encoded binary (e.g., docx/pdf) or plain text
base64encoded = False
try:
binary_mime_types = {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/pdf",
}
if isinstance(document_data, str) and mime_type in binary_mime_types:
base64encoded = True
except Exception:
base64encoded = False
# Create document with file in one step using interfaces directly # Create document with file in one step using interfaces directly
document = self._createDocument( document = self._createDocument(
fileName=document_name, fileName=document_name,
mimeType=mime_type, mimeType=mime_type,
content=content, content=content,
base64encoded=False, base64encoded=base64encoded,
messageId=message_id messageId=message_id
) )
if document: if document:
@@ -272,6 +309,20 @@ class GenerationService:
tuple: (rendered_content, mime_type) tuple: (rendered_content, mime_type)
""" """
try: try:
# DEBUG: dump renderer input to @testing_extraction to diagnose JSON+HTML mixtures TODO REMOVE
try:
import os
ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
debug_root = "../local/testing_extraction"
debug_dir = os.path.join(debug_root, f"render_input_{ts}")
os.makedirs(debug_dir, exist_ok=True)
with open(os.path.join(debug_dir, "meta.txt"), "w", encoding="utf-8") as f:
f.write(f"title: {title}\nformat: {output_format}\nlength: {len(extracted_content or '')}\nstarts_with_brace: {str(extracted_content.strip().startswith('{') if extracted_content else False)}\n")
with open(os.path.join(debug_dir, "extracted_content.txt"), "w", encoding="utf-8") as f:
f.write(extracted_content or "")
except Exception:
pass
# Get the appropriate renderer for the format # Get the appropriate renderer for the format
renderer = self._getFormatRenderer(output_format) renderer = self._getFormatRenderer(output_format)
if not renderer: if not renderer:
@@ -279,6 +330,13 @@ class GenerationService:
# Render the content # Render the content
rendered_content, mime_type = await renderer.render(extracted_content, title) rendered_content, mime_type = await renderer.render(extracted_content, title)
# DEBUG: dump rendered output
try:
import os
with open(os.path.join(debug_dir, "rendered_output.txt"), "w", encoding="utf-8") as f:
f.write(rendered_content or "")
except Exception:
pass
logger.info(f"Successfully rendered report to {output_format} format: {len(rendered_content)} characters") logger.info(f"Successfully rendered report to {output_format} format: {len(rendered_content)} characters")
return rendered_content, mime_type return rendered_content, mime_type

View file

@@ -30,12 +30,12 @@ class MethodAi(MethodBase):
@action @action
async def process(self, parameters: Dict[str, Any]) -> ActionResult: async def process(self, parameters: Dict[str, Any]) -> ActionResult:
""" """
Perform an AI call for any type of task with optional document references Perform a generic AI call with optional document references, producing plain text output
Parameters: Parameters:
aiPrompt (str): The AI prompt for processing aiPrompt (str): The AI prompt for processing
documentList (list, optional): List of document references to include in context documentList (list, optional): List of document references to include in context
expectedDocumentFormat (str, optional): Expected document output format with extension, mimeType, description expectedDocumentFormat (str, optional): Preferred output extension (string or dict). Note: This action only returns plain text content.
processingMode (str, optional): Processing mode - use 'basic', 'advanced', or 'detailed' (defaults to 'basic') processingMode (str, optional): Processing mode - use 'basic', 'advanced', or 'detailed' (defaults to 'basic')
includeMetadata (bool, optional): Whether to include metadata (default: True) includeMetadata (bool, optional): Whether to include metadata (default: True)
operationType (str, optional): Operation type - use 'general', 'generate_plan', 'analyse_content', 'generate_content', 'web_research', 'image_analysis', or 'image_generation' operationType (str, optional): Operation type - use 'general', 'generate_plan', 'analyse_content', 'generate_content', 'web_research', 'image_analysis', or 'image_generation'
@@ -146,6 +146,19 @@ If you need to return multiple documents, add more objects to the documents arra
options=options options=options
) )
# DEBUG dump: write raw AI result to @testing_extraction/ TODO Remove
try:
import os
from datetime import datetime
debug_root = "../local/testing_extraction"
ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
debug_dir = os.path.join(debug_root, f"method_ai_{ts}")
os.makedirs(debug_dir, exist_ok=True)
with open(os.path.join(debug_dir, "raw_result.txt"), "w", encoding="utf-8") as f:
f.write(str(result) if result is not None else "")
except Exception:
pass
# Parse JSON response from AI and create proper ActionDocument objects # Parse JSON response from AI and create proper ActionDocument objects
import json import json
import re import re
@@ -225,6 +238,28 @@ If you need to return multiple documents, add more objects to the documents arra
mimeType=output_mime_type mimeType=output_mime_type
)) ))
# DEBUG dump: write parsed documents to files in the same debug folder
try:
# Reuse the same debug_dir if created above; otherwise create a new one
import os
from datetime import datetime
debug_root = "../local/testing_extraction"
ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
debug_dir = os.path.join(debug_root, f"method_ai_{ts}")
os.makedirs(debug_dir, exist_ok=True)
# Write a summary and individual documents
summary_lines: List[str] = [f"documents: {len(action_documents)}"]
for i, doc in enumerate(action_documents, 1):
summary_lines.append(f"doc[{i}]: name={doc.documentName}, mimeType={doc.mimeType}")
safe_name = doc.documentName or f"doc_{i:03d}.txt"
fpath = os.path.join(debug_dir, safe_name)
with open(fpath, "w", encoding="utf-8") as f:
f.write(str(doc.documentData) if doc.documentData is not None else "")
with open(os.path.join(debug_dir, "summary.txt"), "w", encoding="utf-8") as f:
f.write("\n".join(summary_lines))
except Exception:
pass
# Return result in the standard ActionResult format with parsed documents # Return result in the standard ActionResult format with parsed documents
return ActionResult.isSuccess( return ActionResult.isSuccess(
documents=action_documents documents=action_documents

View file

@@ -527,7 +527,7 @@ class HandlingTasks:
user_prompt = extractUserPrompt(context) user_prompt = extractUserPrompt(context)
available_documents = extractAvailableDocuments(context) available_documents = extractAvailableDocuments(context)
user_language = extractUserLanguage(self.services) user_language = extractUserLanguage(self.services)
available_methods = extractAvailableMethods(self.service) available_methods = extractAvailableMethods(self.services)
# Create placeholders dictionary # Create placeholders dictionary
placeholders = { placeholders = {
@@ -574,7 +574,7 @@ class HandlingTasks:
# Extract content for placeholders # Extract content for placeholders
user_prompt = extractUserPrompt(context) user_prompt = extractUserPrompt(context)
available_documents = extractAvailableDocuments(context) available_documents = extractAvailableDocuments(context)
user_language = extractUserLanguage(self.service) user_language = extractUserLanguage(self.services)
# Get action signature # Get action signature
method = action.get('method', '') method = action.get('method', '')
@@ -1480,16 +1480,19 @@ class HandlingTasks:
# Always use the action's execResultLabel for message creation to ensure proper document routing # Always use the action's execResultLabel for message creation to ensure proper document routing
message_result_label = action.execResultLabel message_result_label = action.execResultLabel
# Create message first to get messageId, then create documents with messageId # Create documents first, then create the message with documents attached in one write
message = await self.createActionMessage(action, result, workflow, message_result_label, [], task_step, task_index, action_index, total_actions) created_documents = self.services.generation.createDocumentsFromActionResult(result, action, workflow, None)
if message: message = await self.createActionMessage(
# Now create documents with the messageId action,
created_documents = self.services.generation.createDocumentsFromActionResult(result, action, workflow, message.id) result,
# Update the message with the created documents workflow,
if created_documents: message_result_label,
message.documents = created_documents created_documents,
# Update the message in the database task_step,
self.services.interfaceDbChat.updateMessage(message.id, {"documents": [doc.dict() for doc in created_documents]}) task_index,
action_index,
total_actions
)
# Log action results # Log action results
logger.info(f"Action completed successfully") logger.info(f"Action completed successfully")