sharepoint implemented

This commit is contained in:
ValueOn AG 2025-09-04 01:01:21 +02:00
parent 8726cd4fb8
commit 5e00b4bd07
24 changed files with 1024 additions and 4608 deletions

View file

@ -341,7 +341,7 @@ class DocumentExtraction:
# Use documentUtility for mime type
mime_type = getMimeTypeFromExtension(getFileExtension(fileName), self._serviceCenter)
mime_type = getMimeTypeFromExtension(getFileExtension(fileName))
return [ContentItem(
label="main",
data=content,
@ -360,7 +360,7 @@ class DocumentExtraction:
"""Process CSV document with robust encoding detection"""
try:
content = self._robustTextDecode(fileData, fileName)
mime_type = getMimeTypeFromExtension(getFileExtension(fileName), self._serviceCenter)
mime_type = getMimeTypeFromExtension(getFileExtension(fileName))
return [ContentItem(
label="main",
data=content,
@ -380,7 +380,7 @@ class DocumentExtraction:
try:
content = self._robustTextDecode(fileData, fileName)
jsonData = json.loads(content)
mime_type = getMimeTypeFromExtension(getFileExtension(fileName), self._serviceCenter)
mime_type = getMimeTypeFromExtension(getFileExtension(fileName))
return [ContentItem(
label="main",
data=content,
@ -399,7 +399,7 @@ class DocumentExtraction:
"""Process XML document with robust encoding detection"""
try:
content = self._robustTextDecode(fileData, fileName)
mime_type = getMimeTypeFromExtension(getFileExtension(fileName), self._serviceCenter)
mime_type = getMimeTypeFromExtension(getFileExtension(fileName))
return [ContentItem(
label="main",
data=content,
@ -418,7 +418,7 @@ class DocumentExtraction:
"""Process HTML document with robust encoding detection"""
try:
content = self._robustTextDecode(fileData, fileName)
mime_type = getMimeTypeFromExtension(getFileExtension(fileName), self._serviceCenter)
mime_type = getMimeTypeFromExtension(getFileExtension(fileName))
return [ContentItem(
label="main",
data=content,
@ -512,7 +512,7 @@ class DocumentExtraction:
# Combine all meaningful content
final_content = "\n".join(meaningful_content)
mime_type = getMimeTypeFromExtension(getFileExtension(fileName), self._serviceCenter)
mime_type = getMimeTypeFromExtension(getFileExtension(fileName))
return [ContentItem(
label="svg_content",
data=final_content,

View file

@ -98,26 +98,12 @@ class DocumentGenerator:
logger.info(f"Document {document_name} has content: {len(content)} characters")
# Create file in system
file_id = self.service.createFile(
fileName=document_name,
mimeType=mime_type,
content=content,
base64encoded=False
)
if not file_id:
logger.error(f"Failed to create file for document {document_name}")
continue
logger.info(f"Created file with ID: {file_id}")
# Create document object using existing file ID
# Create document with file in one step
document = self.service.createDocument(
fileName=document_name,
mimeType=mime_type,
content=content,
base64encoded=False,
existing_file_id=file_id
base64encoded=False
)
if document:
# Set workflow context on the document if possible

View file

@ -1,51 +1,160 @@
import json
import logging
import os
from typing import Any, Dict
logger = logging.getLogger(__name__)
def getFileExtension(fileName: str) -> str:
"""Extract file extension from fileName"""
"""Extract file extension from fileName (without dot, lowercased)."""
if '.' in fileName:
return fileName.rsplit('.', 1)[-1].lower()
return ''
def getMimeTypeFromExtension(extension: str, service=None) -> str:
"""Get MIME type based on file extension. Optionally use a service for mapping."""
if service:
return service.getMimeTypeFromExtension(extension)
# Fallback mapping
mapping = {
def getMimeTypeFromExtension(extension: str) -> str:
"""
Get MIME type based on file extension.
This method consolidates MIME type detection from extension.
Args:
extension: File extension (with or without dot)
Returns:
str: MIME type for the extension
"""
# Normalize extension (remove dot if present)
if extension.startswith('.'):
extension = extension[1:]
# Map extensions to MIME types
mime_types = {
'txt': 'text/plain',
'md': 'text/markdown',
'html': 'text/html',
'css': 'text/css',
'js': 'application/javascript',
'json': 'application/json',
'csv': 'text/csv',
'xml': 'application/xml',
'csv': 'text/csv',
'html': 'text/html',
'htm': 'text/html',
'md': 'text/markdown',
'py': 'text/x-python',
'js': 'application/javascript',
'css': 'text/css',
'pdf': 'application/pdf',
'doc': 'application/msword',
'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'xls': 'application/vnd.ms-excel',
'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'png': 'image/png',
'ppt': 'application/vnd.ms-powerpoint',
'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
'svg': 'image/svg+xml',
'jpg': 'image/jpeg',
'jpeg': 'image/jpeg',
'png': 'image/png',
'gif': 'image/gif',
'svg': 'image/svg+xml',
'bmp': 'image/bmp',
'webp': 'image/webp',
'zip': 'application/zip',
'rar': 'application/x-rar-compressed',
'7z': 'application/x-7z-compressed',
'tar': 'application/x-tar',
'gz': 'application/gzip'
}
return mapping.get(extension.lower(), 'application/octet-stream')
return mime_types.get(extension.lower(), 'application/octet-stream')
def detectContentTypeFromData(fileData: bytes, fileName: str) -> str:
"""
Detect content type from file data and fileName.
This method makes the MIME type detection function accessible through the service center.
Args:
fileData: Raw file data as bytes
fileName: Name of the file
Returns:
str: Detected MIME type
"""
try:
# Check file extension first
ext = os.path.splitext(fileName)[1].lower()
if ext:
# Map common extensions to MIME types
extToMime = {
'.txt': 'text/plain',
'.md': 'text/markdown',
'.csv': 'text/csv',
'.json': 'application/json',
'.xml': 'application/xml',
'.js': 'application/javascript',
'.py': 'application/x-python',
'.svg': 'image/svg+xml',
'.jpg': 'image/jpeg',
'.jpeg': 'image/jpeg',
'.png': 'image/png',
'.gif': 'image/gif',
'.bmp': 'image/bmp',
'.webp': 'image/webp',
'.pdf': 'application/pdf',
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'.doc': 'application/msword',
'.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'.xls': 'application/vnd.ms-excel',
'.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
'.ppt': 'application/vnd.ms-powerpoint',
'.html': 'text/html',
'.htm': 'text/html',
'.css': 'text/css',
'.zip': 'application/zip',
'.rar': 'application/x-rar-compressed',
'.7z': 'application/x-7z-compressed',
'.tar': 'application/x-tar',
'.gz': 'application/gzip'
}
if ext in extToMime:
return extToMime[ext]
# Try to detect from content
if fileData.startswith(b'%PDF'):
return 'application/pdf'
elif fileData.startswith(b'PK\x03\x04'):
# ZIP-based formats (docx, xlsx, pptx)
return 'application/zip'
elif fileData.startswith(b'<'):
# XML-based formats
try:
text = fileData.decode('utf-8', errors='ignore')
if '<svg' in text.lower():
return 'image/svg+xml'
elif '<html' in text.lower():
return 'text/html'
else:
return 'application/xml'
except:
pass
elif fileData.startswith(b'\x89PNG\r\n\x1a\n'):
return 'image/png'
elif fileData.startswith(b'\xff\xd8\xff'):
return 'image/jpeg'
elif fileData.startswith(b'GIF87a') or fileData.startswith(b'GIF89a'):
return 'image/gif'
elif fileData.startswith(b'BM'):
return 'image/bmp'
elif fileData.startswith(b'RIFF') and fileData[8:12] == b'WEBP':
return 'image/webp'
return 'application/octet-stream'
except Exception as e:
logger.error(f"Error detecting content type from data: {str(e)}")
return 'application/octet-stream'
def detectMimeTypeFromData(file_bytes: bytes, fileName: str, service=None) -> str:
"""Detect MIME type from file bytes and fileName using a service if provided."""
try:
if service:
if service and hasattr(service, 'detectContentTypeFromData'):
detected = service.detectContentTypeFromData(file_bytes, fileName)
if detected and detected != 'application/octet-stream':
return detected
# Fallback: guess from extension
ext = getFileExtension(fileName)
return getMimeTypeFromExtension(ext, service)
# Fallback: use our consolidated function
return detectContentTypeFromData(file_bytes, fileName)
except Exception as e:
logger.warning(f"Error in MIME type detection for {fileName}: {str(e)}")
return 'application/octet-stream'

View file

@ -108,7 +108,7 @@ class HandlingTasks:
# Log the full task planning prompt being sent to AI for debugging
logger.info("=== TASK PLANNING PROMPT SENT TO AI ===")
logger.info(f"User Input: {userInput}")
logger.info(f"Available Documents: {len(available_docs) if available_docs else 0}")
logger.info(f"Available Documents: {available_docs}")
logger.info("=== FULL TASK PLANNING PROMPT ===")
logger.info(task_planning_prompt)
logger.info("=== END TASK PLANNING PROMPT ===")
@ -312,12 +312,8 @@ class HandlingTasks:
# Log available resources for debugging
logger.info("=== AVAILABLE RESOURCES FOR ACTION GENERATION ===")
logger.info(f"Available Documents: {len(available_docs) if available_docs else 0}")
if available_docs:
for i, doc in enumerate(available_docs[:5]): # Show first 5
logger.info(f" Doc {i+1}: {doc}")
if len(available_docs) > 5:
logger.info(f" ... and {len(available_docs) - 5} more documents")
logger.info(f"Available Documents: {available_docs}")
# Note: available_docs is now a string description, not a list
logger.info(f"Available Connections: {len(available_connections) if available_connections else 0}")
if available_connections:
for i, conn in enumerate(available_connections[:5]): # Show first 5
@ -376,7 +372,7 @@ class HandlingTasks:
logger.info(f"Task Step ID: {action_context.task_step.id if action_context.task_step else 'None'}")
logger.info(f"Task Step Objective: {action_context.task_step.objective if action_context.task_step else 'None'}")
logger.info(f"Workflow ID: {action_context.workflow_id}")
logger.info(f"Available Documents Count: {len(action_context.available_documents) if action_context.available_documents else 0}")
logger.info(f"Available Documents: {action_context.available_documents or 'No documents available'}")
logger.info(f"Available Connections Count: {len(action_context.available_connections) if action_context.available_connections else 0}")
logger.info(f"Previous Results Count: {len(action_context.previous_results) if action_context.previous_results else 0}")
logger.info(f"Retry Count: {action_context.retry_count}")

View file

@ -1,3 +0,0 @@

View file

@ -20,13 +20,13 @@ def createTaskPlanningPrompt(context: TaskContext, service) -> str:
user_request = context.task_step.objective if context.task_step else 'No request specified'
# Extract available documents from context - use Pydantic model directly
available_documents = context.available_documents or []
available_documents = context.available_documents or "No documents available"
return f"""You are a task planning AI that analyzes user requests and creates structured task plans with user-friendly feedback messages.
USER REQUEST: {user_request}
AVAILABLE DOCUMENTS: {', '.join(available_documents)}
AVAILABLE DOCUMENTS: {available_documents}
INSTRUCTIONS:
1. Analyze the user request and available documents

View file

@ -14,6 +14,7 @@ from modules.interfaces.interfaceChatModel import ActionResult
from modules.interfaces.interfaceComponentObjects import getInterface as getComponentObjects
from modules.interfaces.interfaceAppObjects import getInterface as getAppObjects
from modules.chat.documents.documentExtraction import DocumentExtraction
from modules.chat.documents.documentUtility import getFileExtension, getMimeTypeFromExtension, detectContentTypeFromData
from modules.chat.methodBase import MethodBase
from modules.shared.timezoneUtils import get_utc_timestamp
import uuid
@ -111,165 +112,9 @@ class ServiceCenter:
except Exception as e:
logger.error(f"Error discovering methods: {str(e)}")
def detectContentTypeFromData(self, fileData: bytes, fileName: str) -> str:
"""
Detect content type from file data and fileName.
This method makes the MIME type detection function accessible through the service center.
Args:
fileData: Raw file data as bytes
fileName: Name of the file
Returns:
str: Detected MIME type
"""
try:
# Check file extension first
ext = os.path.splitext(fileName)[1].lower()
if ext:
# Map common extensions to MIME types
extToMime = {
'.txt': 'text/plain',
'.md': 'text/markdown',
'.csv': 'text/csv',
'.json': 'application/json',
'.xml': 'application/xml',
'.js': 'application/javascript',
'.py': 'application/x-python',
'.svg': 'image/svg+xml',
'.jpg': 'image/jpeg',
'.jpeg': 'image/jpeg',
'.png': 'image/png',
'.gif': 'image/gif',
'.bmp': 'image/bmp',
'.webp': 'image/webp',
'.pdf': 'application/pdf',
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'.doc': 'application/msword',
'.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'.xls': 'application/vnd.ms-excel',
'.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
'.ppt': 'application/vnd.ms-powerpoint',
'.html': 'text/html',
'.htm': 'text/html',
'.css': 'text/css',
'.zip': 'application/zip',
'.rar': 'application/x-rar-compressed',
'.7z': 'application/x-7z-compressed',
'.tar': 'application/x-tar',
'.gz': 'application/gzip'
}
if ext in extToMime:
return extToMime[ext]
# Try to detect from content
if fileData.startswith(b'%PDF'):
return 'application/pdf'
elif fileData.startswith(b'PK\x03\x04'):
# ZIP-based formats (docx, xlsx, pptx)
return 'application/zip'
elif fileData.startswith(b'<'):
# XML-based formats
try:
text = fileData.decode('utf-8', errors='ignore')
if '<svg' in text.lower():
return 'image/svg+xml'
elif '<html' in text.lower():
return 'text/html'
else:
return 'application/xml'
except:
pass
elif fileData.startswith(b'\x89PNG\r\n\x1a\n'):
return 'image/png'
elif fileData.startswith(b'\xff\xd8\xff'):
return 'image/jpeg'
elif fileData.startswith(b'GIF87a') or fileData.startswith(b'GIF89a'):
return 'image/gif'
elif fileData.startswith(b'BM'):
return 'image/bmp'
elif fileData.startswith(b'RIFF') and fileData[8:12] == b'WEBP':
return 'image/webp'
return 'application/octet-stream'
except Exception as e:
logger.error(f"Error detecting content type from data: {str(e)}")
return 'application/octet-stream'
def getMimeTypeFromExtension(self, extension: str) -> str:
"""
Get MIME type based on file extension.
This method consolidates MIME type detection from extension.
Args:
extension: File extension (with or without dot)
Returns:
str: MIME type for the extension
"""
# Normalize extension (remove dot if present)
if extension.startswith('.'):
extension = extension[1:]
# Map extensions to MIME types
mime_types = {
'txt': 'text/plain',
'json': 'application/json',
'xml': 'application/xml',
'csv': 'text/csv',
'html': 'text/html',
'htm': 'text/html',
'md': 'text/markdown',
'py': 'text/x-python',
'js': 'application/javascript',
'css': 'text/css',
'pdf': 'application/pdf',
'doc': 'application/msword',
'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'xls': 'application/vnd.ms-excel',
'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'ppt': 'application/vnd.ms-powerpoint',
'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
'svg': 'image/svg+xml',
'jpg': 'image/jpeg',
'jpeg': 'image/jpeg',
'png': 'image/png',
'gif': 'image/gif',
'bmp': 'image/bmp',
'webp': 'image/webp',
'zip': 'application/zip',
'rar': 'application/x-rar-compressed',
'7z': 'application/x-7z-compressed',
'tar': 'application/x-tar',
'gz': 'application/gzip'
}
return mime_types.get(extension.lower(), 'application/octet-stream')
def getFileExtension(self, fileName: str) -> str:
"""
Extract file extension from fileName.
Args:
fileName: Name of the file
Returns:
str: File extension (without dot)
"""
if '.' in fileName:
return fileName.split('.')[-1].lower()
return "txt" # Default to text
def getFileExtension(self, fileName):
"""
Extract file extension from fileName (without dot, lowercased).
Returns empty string if no extension is found.
"""
if '.' in fileName:
return fileName.rsplit('.', 1)[-1].lower()
return ''
# ===== Functions =====
# ===== Functions for Prompts: Context =====
def getMethodsList(self) -> List[str]:
"""Get list of available methods with their signatures in the required format"""
@ -283,157 +128,48 @@ class ServiceCenter:
methodList.append(signature)
return methodList
def generateDocumentLabel(self, document: ChatDocument, message: ChatMessage) -> str:
"""Generate new document label: round+task+action+filename.extension"""
async def summarizeChat(self, messages: List[ChatMessage]) -> str:
"""
Summarize chat messages from last to first message with status="first"
Args:
messages: List of chat messages to summarize
Returns:
str: Summary of the chat in user's language
"""
try:
# Get workflow context from message
round_num = message.roundNumber if hasattr(message, 'roundNumber') else 1
task_num = message.taskNumber if hasattr(message, 'taskNumber') else 0
action_num = message.actionNumber if hasattr(message, 'actionNumber') else 0
# Get messages from last to first, stopping at first message with status="first"
relevantMessages = []
for msg in reversed(messages):
relevantMessages.append(msg)
if msg.status == "first":
break
# Get file extension from document's fileName property
try:
file_extension = self.getFileExtension(document.fileName)
filename = document.fileName
except Exception as e:
# Try to diagnose and recover the issue
diagnosis = self.diagnoseDocumentAccess(document)
logger.error(f"Critical error: Cannot access document fileName for document {document.id}. Diagnosis: {diagnosis}")
# Attempt recovery
if self.recoverDocumentAccess(document):
try:
file_extension = self.getFileExtension(document.fileName)
filename = document.fileName
logger.info(f"Document access recovered for {document.id}")
except Exception as recovery_error:
logger.error(f"Recovery failed for document {document.id}: {str(recovery_error)}")
raise RuntimeError(f"Document {document.id} is permanently inaccessible after recovery attempt: {str(recovery_error)}")
else:
# Recovery failed - don't continue with invalid data
raise RuntimeError(f"Document {document.id} is inaccessible and recovery failed. Diagnosis: {diagnosis}")
# Construct label: round1_task2_action3_filename.ext
if file_extension:
label = f"round{round_num}_task{task_num}_action{action_num}_{filename}"
else:
label = f"round{round_num}_task{task_num}_action{action_num}_{filename}"
return label
except Exception as e:
logger.error(f"Critical error generating document label for document {document.id}: {str(e)}")
# Re-raise the error to prevent workflow from continuing with invalid data
raise
# Create prompt for AI
prompt = f"""You are an AI assistant providing a summary of a chat conversation.
Please respond in '{self.user.language}' language.
def getDocumentReferenceList(self) -> Dict[str, List[DocumentExchange]]:
"""Get list of document exchanges with new labeling format, sorted by recency"""
# Collect all documents first and refresh their attributes
all_documents = []
for message in self.workflow.messages:
if message.documents:
all_documents.extend(message.documents)
# Refresh file attributes for all documents
if all_documents:
self.refreshDocumentFileAttributes(all_documents)
chat_exchanges = []
history_exchanges = []
# Process messages in reverse order; "first" marks boundary
in_current_round = True
for message in reversed(self.workflow.messages):
is_first = message.status == "first" if hasattr(message, 'status') else False
Chat History:
{chr(10).join(f"- {msg.message}" for msg in reversed(relevantMessages))}
Instructions:
1. Summarize the conversation's key points and outcomes
2. Be concise but informative
3. Use a professional but friendly tone
4. Focus on important decisions and next steps if any
Please provide a comprehensive summary of this conversation."""
# Build a DocumentExchange if message has documents
doc_exchange = None
if message.documents:
if message.actionId and message.documentsLabel:
# Validate that we use the same label as in the message
validated_label = self._validateDocumentLabelConsistency(message)
# Use the message's actual documentsLabel
doc_refs = []
for doc in message.documents:
doc_ref = self.getDocumentReferenceFromChatDocument(doc, message)
doc_refs.append(doc_ref)
doc_exchange = DocumentExchange(
documentsLabel=validated_label,
documents=doc_refs
)
else:
# Generate new labels for documents without explicit labels
doc_refs = []
for doc in message.documents:
doc_ref = self.getDocumentReferenceFromChatDocument(doc, message)
doc_refs.append(doc_ref)
if doc_refs:
# Create a label based on message context
round_num = message.roundNumber if hasattr(message, 'roundNumber') else 1
task_num = message.taskNumber if hasattr(message, 'taskNumber') else 0
action_num = message.actionNumber if hasattr(message, 'actionNumber') else 0
context_label = f"round{round_num}_task{task_num}_action{action_num}_context"
doc_exchange = DocumentExchange(
documentsLabel=context_label,
documents=doc_refs
)
# Get summary using AI
return await self.callAiTextBasic(prompt)
# Append to appropriate container based on boundary
if doc_exchange:
if in_current_round:
chat_exchanges.append(doc_exchange)
else:
history_exchanges.append(doc_exchange)
# Flip boundary after including the "first" message in chat
if in_current_round and is_first:
in_current_round = False
# Sort by recency: most recent first, then current round, then earlier rounds
# Sort chat exchanges by message sequence number (most recent first)
chat_exchanges.sort(key=lambda x: self._getMessageSequenceForExchange(x), reverse=True)
# Sort history exchanges by message sequence number (most recent first)
history_exchanges.sort(key=lambda x: self._getMessageSequenceForExchange(x), reverse=True)
return {
"chat": chat_exchanges,
"history": history_exchanges
}
except Exception as e:
logger.error(f"Error summarizing chat: {str(e)}")
return f"Error summarizing chat: {str(e)}"
# ===== Functions for Prompts + Actions: Document References generation and resolution =====
def _getMessageSequenceForExchange(self, exchange: DocumentExchange) -> int:
"""Get message sequence number for sorting exchanges by recency"""
try:
# Extract message ID from the first document reference
if exchange.documents and len(exchange.documents) > 0:
first_doc_ref = exchange.documents[0]
if first_doc_ref.startswith("docItem:"):
# docItem:<id>:<label> - extract ID
parts = first_doc_ref.split(':')
if len(parts) >= 2:
doc_id = parts[1]
# Find the message containing this document
for message in self.workflow.messages:
if message.documents:
for doc in message.documents:
if doc.id == doc_id:
return message.sequenceNr if hasattr(message, 'sequenceNr') else 0
elif first_doc_ref.startswith("docList:"):
# docList:<message_id>:<label> - extract message ID
parts = first_doc_ref.split(':')
if len(parts) >= 2:
message_id = parts[1]
# Find the message by ID
for message in self.workflow.messages:
if str(message.id) == message_id:
return message.sequenceNr if hasattr(message, 'sequenceNr') else 0
return 0
except Exception as e:
logger.error(f"Error getting message sequence for exchange: {str(e)}")
return 0
def getEnhancedDocumentContext(self) -> str:
"""Get enhanced document context formatted for action planning prompts with proper docList and docItem references"""
try:
@ -509,6 +245,144 @@ class ServiceCenter:
logger.error(f"Error generating enhanced document context: {str(e)}")
return "NO DOCUMENTS AVAILABLE - Error generating document context."
def getDocumentReferenceList(self) -> Dict[str, List[DocumentExchange]]:
"""Get list of document exchanges with new labeling format, sorted by recency"""
# Collect all documents first and refresh their attributes
all_documents = []
for message in self.workflow.messages:
if message.documents:
all_documents.extend(message.documents)
# Refresh file attributes for all documents
if all_documents:
self._refreshDocumentFileAttributes(all_documents)
chat_exchanges = []
history_exchanges = []
# Process messages in reverse order; "first" marks boundary
in_current_round = True
for message in reversed(self.workflow.messages):
is_first = message.status == "first" if hasattr(message, 'status') else False
# Build a DocumentExchange if message has documents
doc_exchange = None
if message.documents:
if message.actionId and message.documentsLabel:
# Validate that we use the same label as in the message
validated_label = self._validateDocumentLabelConsistency(message)
# Use the message's actual documentsLabel
doc_refs = []
for doc in message.documents:
doc_ref = self._getDocumentReferenceFromChatDocument(doc, message)
doc_refs.append(doc_ref)
doc_exchange = DocumentExchange(
documentsLabel=validated_label,
documents=doc_refs
)
else:
# Generate new labels for documents without explicit labels
doc_refs = []
for doc in message.documents:
doc_ref = self._getDocumentReferenceFromChatDocument(doc, message)
doc_refs.append(doc_ref)
if doc_refs:
# Create a label based on message context
context_prefix = self._generateWorkflowContextPrefix(message)
context_label = f"{context_prefix}_context"
doc_exchange = DocumentExchange(
documentsLabel=context_label,
documents=doc_refs
)
# Append to appropriate container based on boundary
if doc_exchange:
if in_current_round:
chat_exchanges.append(doc_exchange)
else:
history_exchanges.append(doc_exchange)
# Flip boundary after including the "first" message in chat
if in_current_round and is_first:
in_current_round = False
# Sort by recency: most recent first, then current round, then earlier rounds
# Sort chat exchanges by message sequence number (most recent first)
chat_exchanges.sort(key=lambda x: self._getMessageSequenceForExchange(x), reverse=True)
# Sort history exchanges by message sequence number (most recent first)
history_exchanges.sort(key=lambda x: self._getMessageSequenceForExchange(x), reverse=True)
return {
"chat": chat_exchanges,
"history": history_exchanges
}
def _refreshDocumentFileAttributes(self, documents: List[ChatDocument]) -> None:
"""Update file attributes (fileName, fileSize, mimeType) for documents"""
for doc in documents:
try:
file_item = self.interfaceComponent.getFile(doc.fileId)
if file_item:
doc.fileName = file_item.fileName
doc.fileSize = file_item.fileSize
doc.mimeType = file_item.mimeType
else:
logger.warning(f"File not found for document {doc.id}, fileId: {doc.fileId}")
except Exception as e:
logger.error(f"Error refreshing file attributes for document {doc.id}: {e}")
def _generateWorkflowContextPrefix(self, message: ChatMessage) -> str:
"""Generate workflow context prefix: round{num}_task{num}_action{num}"""
round_num = message.roundNumber if hasattr(message, 'roundNumber') else 1
task_num = message.taskNumber if hasattr(message, 'taskNumber') else 0
action_num = message.actionNumber if hasattr(message, 'actionNumber') else 0
return f"round{round_num}_task{task_num}_action{action_num}"
def _getDocumentReferenceFromChatDocument(self, document: ChatDocument, message: ChatMessage) -> str:
"""Get document reference using document ID and filename."""
try:
# Use document ID and filename for simple reference
return f"docItem:{document.id}:{document.fileName}"
except Exception as e:
logger.error(f"Critical error creating document reference for document {document.id}: {str(e)}")
# Re-raise the error to prevent workflow from continuing with invalid data
raise
def _getMessageSequenceForExchange(self, exchange: DocumentExchange) -> int:
"""Get message sequence number for sorting exchanges by recency"""
try:
# Extract message ID from the first document reference
if exchange.documents and len(exchange.documents) > 0:
first_doc_ref = exchange.documents[0]
if first_doc_ref.startswith("docItem:"):
# docItem:<id>:<label> - extract ID
parts = first_doc_ref.split(':')
if len(parts) >= 2:
doc_id = parts[1]
# Find the message containing this document
for message in self.workflow.messages:
if message.documents:
for doc in message.documents:
if doc.id == doc_id:
return message.sequenceNr if hasattr(message, 'sequenceNr') else 0
elif first_doc_ref.startswith("docList:"):
# docList:<message_id>:<label> - extract message ID
parts = first_doc_ref.split(':')
if len(parts) >= 2:
message_id = parts[1]
# Find the message by ID
for message in self.workflow.messages:
if str(message.id) == message_id:
return message.sequenceNr if hasattr(message, 'sequenceNr') else 0
return 0
except Exception as e:
logger.error(f"Error getting message sequence for exchange: {str(e)}")
return 0
def _validateDocumentLabelConsistency(self, message) -> str:
"""Validate that the document label used for references matches the message's actual label"""
if not hasattr(message, 'documentsLabel') or not message.documentsLabel:
@ -571,27 +445,6 @@ class ServiceCenter:
logger.error(f"Error extracting document info from reference: {str(e)}")
return None
def getDocumentReferenceFromChatDocument(self, document: ChatDocument, message: ChatMessage) -> str:
"""Get document reference using document ID and filename."""
try:
# Use document ID and filename for simple reference
return f"docItem:{document.id}:{document.fileName}"
except Exception as e:
logger.error(f"Critical error creating document reference for document {document.id}: {str(e)}")
# Re-raise the error to prevent workflow from continuing with invalid data
raise
def getDocumentListReferenceFromChatMessage(self, message: ChatMessage) -> str:
"""Get document list reference using message ID and label."""
try:
# Use message ID and documentsLabel for document list reference
label = getattr(message, 'documentsLabel', f"message_{message.id}")
return f"docList:{message.id}:{label}"
except Exception as e:
logger.error(f"Critical error creating document list reference for message {message.id}: {str(e)}")
# Re-raise the error to prevent workflow from continuing with invalid data
raise
def getChatDocumentsFromDocumentList(self, documentList: List[str]) -> List[ChatDocument]:
"""Get ChatDocuments from a list of document references using all three formats."""
try:
@ -731,6 +584,8 @@ class ServiceCenter:
logger.error(f"Error getting documents from document list: {str(e)}")
return []
# ===== Functions for Prompts + Actions: Connection References generation and resolution =====
def getConnectionReferenceList(self) -> List[str]:
"""Get list of all UserConnection objects as references with enhanced state information"""
connections = []
@ -827,46 +682,8 @@ class ServiceCenter:
logger.error(f"Error parsing connection reference: {str(e)}")
return None
async def summarizeChat(self, messages: List[ChatMessage]) -> str:
"""
Summarize chat messages from last to first message with status="first"
Args:
messages: List of chat messages to summarize
# ===== Functions for Actions: AI calls =====
Returns:
str: Summary of the chat in user's language
"""
try:
# Get messages from last to first, stopping at first message with status="first"
relevantMessages = []
for msg in reversed(messages):
relevantMessages.append(msg)
if msg.status == "first":
break
# Create prompt for AI
prompt = f"""You are an AI assistant providing a summary of a chat conversation.
Please respond in '{self.user.language}' language.
Chat History:
{chr(10).join(f"- {msg.message}" for msg in reversed(relevantMessages))}
Instructions:
1. Summarize the conversation's key points and outcomes
2. Be concise but informative
3. Use a professional but friendly tone
4. Focus on important decisions and next steps if any
Please provide a comprehensive summary of this conversation."""
# Get summary using AI
return await self.callAiTextBasic(prompt)
except Exception as e:
logger.error(f"Error summarizing chat: {str(e)}")
return f"Error summarizing chat: {str(e)}"
async def callAiTextAdvanced(self, prompt: str, context: str = None) -> str:
"""Advanced text processing using Anthropic, with fallback to OpenAI basic if advanced fails."""
max_retries = 3
@ -959,6 +776,8 @@ Please provide a comprehensive summary of this conversation."""
return response
# ===== Functions for Actions: Data management =====
def getFileInfo(self, fileId: str) -> Dict[str, Any]:
"""Get file information"""
file_item = self.interfaceComponent.getFile(fileId)
@ -997,11 +816,11 @@ Please provide a comprehensive summary of this conversation."""
mimeType = document.mimeType
except Exception as e:
# Try to diagnose and recover the issue
diagnosis = self.diagnoseDocumentAccess(document)
diagnosis = self._diagnoseDocumentAccess(document)
logger.error(f"Critical error: Cannot access document properties for document {document.id}. Diagnosis: {diagnosis}")
# Attempt recovery
if self.recoverDocumentAccess(document):
if self._recoverDocumentAccess(document):
try:
fileName = document.fileName
mimeType = document.mimeType
@ -1031,9 +850,78 @@ Please provide a comprehensive summary of this conversation."""
except Exception as e:
logger.error(f"Error extracting from document: {str(e)}")
raise
def createFile(self, fileName: str, mimeType: str, content: str, base64encoded: bool = False) -> str:
"""Create new file and return its ID"""
def _diagnoseDocumentAccess(self, document: ChatDocument) -> Dict[str, Any]:
"""
Diagnose document access issues and provide recovery information.
This method helps identify why document properties are inaccessible.
"""
try:
diagnosis = {
'document_id': document.id,
'file_id': document.fileId,
'has_component_interface': document._componentInterface is not None,
'component_interface_type': type(document._componentInterface).__name__ if document._componentInterface else None,
'file_exists': False,
'file_info': None,
'error_details': None
}
# Check if component interface is set
if not document._componentInterface:
diagnosis['error_details'] = "Component interface not set - document cannot access file system"
return diagnosis
# Try to access the file directly
try:
file_info = self.interfaceComponent.getFile(document.fileId)
if file_info:
diagnosis['file_exists'] = True
diagnosis['file_info'] = {
'fileName': file_info.fileName if hasattr(file_info, 'fileName') else 'N/A',
'fileSize': file_info.fileSize if hasattr(file_info, 'fileSize') else 'N/A',
'mimeType': file_info.mimeType if hasattr(file_info, 'mimeType') else 'N/A'
}
else:
diagnosis['error_details'] = f"File with ID {document.fileId} not found in component interface"
except Exception as e:
diagnosis['error_details'] = f"Error accessing file {document.fileId}: {str(e)}"
return diagnosis
except Exception as e:
return {
'document_id': document.id if hasattr(document, 'id') else 'unknown',
'file_id': document.fileId if hasattr(document, 'fileId') else 'unknown',
'error_details': f"Error during diagnosis: {str(e)}"
}
def _recoverDocumentAccess(self, document: ChatDocument) -> bool:
"""
Attempt to recover document access by re-setting the component interface.
Returns True if recovery was successful.
"""
try:
logger.info(f"Attempting to recover document access for document {document.id}")
# Re-set the component interface
document.setComponentInterface(self.interfaceComponent)
# Test if we can now access the fileName
try:
test_fileName = document.fileName
logger.info(f"Document access recovered for {document.id} -> {test_fileName}")
return True
except Exception as e:
logger.error(f"Document access recovery failed for {document.id}: {str(e)}")
return False
except Exception as e:
logger.error(f"Error during document access recovery for {document.id}: {str(e)}")
return False
def createDocument(self, fileName: str, mimeType: str, content: str, base64encoded: bool = True) -> ChatDocument:
"""Create document with file in one step - handles file creation internally"""
# Convert content to bytes based on base64 flag
if base64encoded:
import base64
@ -1051,27 +939,16 @@ Please provide a comprehensive summary of this conversation."""
# Then store the file data
self.interfaceComponent.createFileData(file_item.id, content_bytes)
return file_item.id
def createDocument(self, fileName: str, mimeType: str, content: str, base64encoded: bool = True, existing_file_id: str = None) -> ChatDocument:
"""Create document AND file from file data object created by AI call"""
# Use existing file ID if provided, otherwise create new file
if existing_file_id:
file_id = existing_file_id
else:
# First create the file and get its ID
file_id = self.createFile(fileName, mimeType, content, base64encoded)
# Get file info to copy attributes
file_info = self.getFileInfo(file_id)
file_info = self.getFileInfo(file_item.id)
if not file_info:
logger.error(f"Could not get file info for fileId: {file_id}")
raise ValueError(f"File info not found for fileId: {file_id}")
logger.error(f"Could not get file info for fileId: {file_item.id}")
raise ValueError(f"File info not found for fileId: {file_item.id}")
# Create document with all file attributes copied
document = ChatDocument(
id=str(uuid.uuid4()),
fileId=file_id,
fileId=file_item.id,
fileName=file_info.get("fileName", fileName),
fileSize=file_info.get("size", 0),
mimeType=file_info.get("mimeType", mimeType)
@ -1079,6 +956,8 @@ Please provide a comprehensive summary of this conversation."""
return document
# ===== Internal public helper functions =====
def updateWorkflowStats(self, eventLabel: str = None, bytesSent: int = 0, bytesReceived: int = 0, tokenCount: int = 0) -> None:
"""
Centralized function to update workflow statistics in database and running workflow.
@ -1128,24 +1007,40 @@ Please provide a comprehensive summary of this conversation."""
logger.error(f"Error calculating object size: {str(e)}")
return 0
def getAvailableDocuments(self, workflow) -> List[str]:
def getAvailableDocuments(self, workflow) -> str:
"""
Get list of available document fileNames from workflow with new labeling format.
Get simple description of available documents for task planning.
Args:
workflow: ChatWorkflow object
Returns:
List[str]: List of document labels in new format
str: Simple description of document availability
"""
documents = []
total_documents = 0
document_types = set()
for message in workflow.messages:
for doc in message.documents:
# Generate new label format
label = self.generateDocumentLabel(doc, message)
documents.append(label)
return documents
if message.documents:
total_documents += len(message.documents)
for doc in message.documents:
try:
file_extension = getFileExtension(doc.fileName)
if file_extension:
document_types.add(file_extension.upper())
except:
pass
if total_documents == 0:
return "No documents available"
elif len(document_types) == 0:
return f"{total_documents} document(s) available"
else:
types_str = ", ".join(sorted(document_types))
return f"{total_documents} document(s) available ({types_str} files)"
# ===== Functions for Manager: Execution Tools =====
async def executeAction(self, methodName: str, actionName: str, parameters: Dict[str, Any]) -> ActionResult:
"""Execute a method action"""
try:
@ -1193,6 +1088,8 @@ Please provide a comprehensive summary of this conversation."""
"""Set user language for the service center"""
self.user.language = language
# ===== Functions for Manager: Workflow Tools =====
def setWorkflowContext(self, round_number: int = None, task_number: int = None, action_number: int = None):
"""Set current workflow context for document generation and routing"""
try:
@ -1287,88 +1184,5 @@ Please provide a comprehensive summary of this conversation."""
'workflowId': 'unknown'
}
def refreshDocumentFileAttributes(self, documents: List[ChatDocument]) -> None:
"""Update file attributes (fileName, fileSize, mimeType) for documents"""
for doc in documents:
try:
file_item = self.interfaceComponent.getFile(doc.fileId)
if file_item:
doc.fileName = file_item.fileName
doc.fileSize = file_item.fileSize
doc.mimeType = file_item.mimeType
else:
logger.warning(f"File not found for document {doc.id}, fileId: {doc.fileId}")
except Exception as e:
logger.error(f"Error refreshing file attributes for document {doc.id}: {e}")
def diagnoseDocumentAccess(self, document: ChatDocument) -> Dict[str, Any]:
"""
Diagnose document access issues and provide recovery information.
This method helps identify why document properties are inaccessible.
"""
try:
diagnosis = {
'document_id': document.id,
'file_id': document.fileId,
'has_component_interface': document._componentInterface is not None,
'component_interface_type': type(document._componentInterface).__name__ if document._componentInterface else None,
'file_exists': False,
'file_info': None,
'error_details': None
}
# Check if component interface is set
if not document._componentInterface:
diagnosis['error_details'] = "Component interface not set - document cannot access file system"
return diagnosis
# Try to access the file directly
try:
file_info = self.interfaceComponent.getFile(document.fileId)
if file_info:
diagnosis['file_exists'] = True
diagnosis['file_info'] = {
'fileName': file_info.fileName if hasattr(file_info, 'fileName') else 'N/A',
'fileSize': file_info.fileSize if hasattr(file_info, 'fileSize') else 'N/A',
'mimeType': file_info.mimeType if hasattr(file_info, 'mimeType') else 'N/A'
}
else:
diagnosis['error_details'] = f"File with ID {document.fileId} not found in component interface"
except Exception as e:
diagnosis['error_details'] = f"Error accessing file {document.fileId}: {str(e)}"
return diagnosis
except Exception as e:
return {
'document_id': document.id if hasattr(document, 'id') else 'unknown',
'file_id': document.fileId if hasattr(document, 'fileId') else 'unknown',
'error_details': f"Error during diagnosis: {str(e)}"
}
def recoverDocumentAccess(self, document: ChatDocument) -> bool:
"""
Attempt to recover document access by re-setting the component interface.
Returns True if recovery was successful.
"""
try:
logger.info(f"Attempting to recover document access for document {document.id}")
# Re-set the component interface
document.setComponentInterface(self.interfaceComponent)
# Test if we can now access the fileName
try:
test_fileName = document.fileName
logger.info(f"Document access recovered for {document.id} -> {test_fileName}")
return True
except Exception as e:
logger.error(f"Document access recovery failed for {document.id}: {str(e)}")
return False
except Exception as e:
logger.error(f"Error during document access recovery for {document.id}: {str(e)}")
return False
# Create singleton instance
serviceObject = None

View file

@ -732,7 +732,7 @@ class TaskContext(BaseModel, ModelMixin):
workflow_id: Optional[str] = None
# Available resources
available_documents: Optional[list[str]] = []
available_documents: Optional[str] = "No documents available"
available_connections: Optional[list[str]] = []
# Previous execution state
@ -755,8 +755,8 @@ class TaskContext(BaseModel, ModelMixin):
criteria_progress: Optional[dict] = None
def getDocumentReferences(self) -> List[str]:
"""Get all available document references"""
docs = self.available_documents or []
"""Get all available document references from previous handover"""
docs = []
if self.previous_handover:
for doc_exchange in self.previous_handover.inputDocuments:
docs.extend(doc_exchange.documents)

View file

@ -731,8 +731,6 @@ class MethodOutlook(MethodBase):
attachment_docs = self.service.getChatDocumentsFromDocumentList([attachment_ref])
if attachment_docs:
for doc in attachment_docs:
# Get the actual file content using fileId
file_id = getattr(doc, 'fileId', None)
if file_id:
@ -757,15 +755,15 @@ class MethodOutlook(MethodBase):
"contentBytes": base64_content
}
message["attachments"].append(attachment)
else:
logger.warning(f"No content found for attachment: {doc.fileName}")
except Exception as e:
logger.error(f"Error reading attachment file {doc.fileName}: {str(e)}")
else:
logger.warning(f"Attachment document has no fileId: {doc.fileName}")
else:
logger.warning(f"No attachment documents found for reference: {attachment_ref}")
else:
logger.warning(f"No attachment documents found for reference: {attachment_ref}")
# Create the draft message
# First, get the Drafts folder ID to ensure the draft is created there

File diff suppressed because it is too large Load diff

View file

@ -141,7 +141,7 @@ class WorkflowManager:
self.chatManager.handlingTasks._checkWorkflowStopped()
# Create initial message using interface
# Generate the correct documentsLabel that matches what getDocumentReferenceList() will create
# Generate the correct documentsLabel that matches what getDocumentReferenceString will create
round_num = workflow.currentRound
task_num = 0
action_num = 0

View file

@ -2,7 +2,9 @@
TODO
# System
- web
- sharepoint to fix
- document handling centralized
- ai handling centralized
- neutralizer to activate AND put back placeholders to the returned data
# Tests

View file

@ -1,128 +0,0 @@
<!DOCTYPE html>
<html lang="de">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Management Summary: Methoden-basierte Chat-Architektur</title>
<style>
body {
font-family: Arial, sans-serif;
line-height: 1.6;
max-width: 800px;
margin: 0 auto;
padding: 20px;
color: #333;
}
h1 {
color: #2c3e50;
border-bottom: 2px solid #3498db;
padding-bottom: 10px;
}
h2 {
color: #2c3e50;
margin-top: 30px;
}
.example {
background-color: #f8f9fa;
border-left: 4px solid #3498db;
padding: 15px;
margin: 20px 0;
}
.old-arch, .new-arch {
margin: 15px 0;
padding: 15px;
border-radius: 5px;
}
.old-arch {
background-color: #fff3cd;
border: 1px solid #ffeeba;
}
.new-arch {
background-color: #d4edda;
border: 1px solid #c3e6cb;
}
.benefits {
background-color: #e8f4f8;
padding: 15px;
border-radius: 5px;
margin: 20px 0;
}
.benefits ul {
margin: 10px 0;
padding-left: 20px;
}
.benefits li {
margin: 5px 0;
}
</style>
</head>
<body>
<h1>Management Summary: Methoden-basierte Chat-Architektur</h1>
<p>Die Umstellung von einer Agenten-basierten auf eine Methoden-basierte Chat-Architektur stellt einen fundamentalen Paradigmenwechsel dar. Während die Mehrheit der KI-Chat-Systeme weiterhin auf Agenten-Architekturen setzt, ermöglicht unser methoden-basierter Ansatz eine präzisere Kontrolle und effizientere Integration.</p>
<p>Der methoden-basierte Ansatz definiert klare, selbstbeschreibende Operationen mit festgelegten Parametern und Ergebnissen. Im Gegensatz zu Agenten, die als Blackbox-Operationen fungieren, bieten Methoden eine transparente, validierbare und vorhersehbare Ausführung. Diese Struktur ermöglicht eine präzise Fehlerbehandlung und Retry-Logik auf Aktions-Ebene, anstatt auf Agenten-Ebene.</p>
<p>Die Integration mit Benutzerdaten erfolgt direkt über definierte Authentifizierungspfade, was die Sicherheit erhöht und die Komplexität reduziert. Jede Methode ist selbstbeschreibend und enthält ihre eigenen Validierungsregeln, was die Wartbarkeit verbessert und die Entwicklung neuer Funktionen beschleunigt.</p>
<p>Der methoden-basierte Ansatz reduziert die KI-Abhängigkeit bei der Ausführung von Operationen, während die KI weiterhin für die Planung und Koordination der Methoden eingesetzt wird. Diese Trennung von Planung und Ausführung führt zu zuverlässigeren Ergebnissen und besserer Nachvollziehbarkeit.</p>
<p>Die Architektur ermöglicht eine präzise Dokumentation und Validierung jeder Operation, was in einer regulierten Umgebung von besonderem Wert ist. Die klare Struktur erleichtert die Integration neuer Dienste und die Erweiterung bestehender Funktionalitäten.</p>
<h2>Praktisches Beispiel: Dokumentenverarbeitung und E-Mail-Versand</h2>
<div class="example">
<div class="old-arch">
<strong>Alte Agenten-basierte Architektur:</strong><br>
<pre>
Benutzer: "Suche nach Verträgen im SharePoint und sende mir eine Zusammenfassung per E-Mail"
Agent SharePoint:
- Sucht nach Verträgen
- Extrahiert Inhalte
- Speichert Ergebnisse
Agent Outlook:
- Liest Ergebnisse
- Erstellt E-Mail
- Sendet E-Mail</pre>
</div>
<div class="new-arch">
<strong>Neue Methoden-basierte Architektur:</strong><br>
<pre>
Benutzer: "Suche nach Verträgen im SharePoint und sende mir eine Zusammenfassung per E-Mail"
Methoden-Katalog:
1. SharePoint.searchDocuments
- Parameter: {query: "Verträge", site: "valueon"}
- Retry: 3x bei Netzwerkfehler
- Auth: MSFT
2. Document.extractContent
- Parameter: {documents: [...], sections: ["Zusammenfassung"]}
- Retry: 2x bei Extraktionsfehler
- Auth: LOCAL
3. Outlook.sendMail
- Parameter: {to: ["user@example.com"], subject: "Vertragszusammenfassung"}
- Retry: 1x bei SMTP-Fehler
- Auth: MSFT</pre>
</div>
<div class="benefits">
<strong>Vorteile im Beispiel:</strong>
<ul>
<li>Jede Operation ist klar definiert und validierbar</li>
<li>Retry-Logik ist spezifisch für jede Operation</li>
<li>Authentifizierung ist explizit definiert</li>
<li>Fehler können präzise zugeordnet werden</li>
<li>Operationen können unabhängig voneinander getestet werden</li>
<li>Neue Operationen können einfach hinzugefügt werden</li>
</ul>
</div>
</div>
<p>Die KI plant die Ausführung dieser Methoden, aber die eigentliche Ausführung erfolgt durch die definierten Methoden mit klaren Parametern und Ergebnissen. Dies führt zu einer zuverlässigeren und besser nachvollziehbaren Ausführung.</p>
</body>
</html>

View file

@ -1,129 +0,0 @@
<!DOCTYPE html>
<html lang="de">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Management Summary: Methoden-basierte Chat-Architektur</title>
<style>
body {
font-family: Arial, sans-serif;
line-height: 1.6;
max-width: 800px;
margin: 0 auto;
padding: 20px;
color: #333;
}
h1 {
color: #2c3e50;
border-bottom: 2px solid #3498db;
padding-bottom: 10px;
}
h2 {
color: #2c3e50;
margin-top: 30px;
}
.example {
background-color: #f8f9fa;
border-left: 4px solid #3498db;
padding: 15px;
margin: 20px 0;
}
.old-arch, .new-arch {
margin: 15px 0;
padding: 15px;
border-radius: 5px;
}
.old-arch {
background-color: #fff3cd;
border: 1px solid #ffeeba;
}
.new-arch {
background-color: #d4edda;
border: 1px solid #c3e6cb;
}
.benefits {
background-color: #e8f4f8;
padding: 15px;
border-radius: 5px;
margin: 20px 0;
}
.benefits ul {
margin: 10px 0;
padding-left: 20px;
}
.benefits li {
margin: 5px 0;
}
</style>
</head>
<body>
<h1>Management Summary: Methoden-basierte Chat-Architektur</h1>
<p>Die Umstellung von einer Agenten-basierten auf eine Methoden-basierte Chat-Architektur stellt einen fundamentalen Paradigmenwechsel dar. Während die Mehrheit der KI-Chat-Systeme weiterhin auf Agenten-Architekturen setzt, ermöglicht unser methoden-basierter Ansatz eine präzisere Kontrolle und effizientere Integration.</p>
<p>Der methoden-basierte Ansatz definiert klare, selbstbeschreibende Operationen mit festgelegten Parametern und Ergebnissen. Im Gegensatz zu Agenten, die als Blackbox-Operationen fungieren, bieten Methoden eine transparente, validierbare und vorhersehbare Ausführung. Diese Struktur ermöglicht eine präzise Fehlerbehandlung und Retry-Logik auf Aktions-Ebene, anstatt auf Agenten-Ebene.</p>
<p>Die Integration mit Benutzerdaten erfolgt direkt über definierte Authentifizierungspfade, was die Sicherheit erhöht und die Komplexität reduziert. Jede Methode ist selbstbeschreibend und enthält ihre eigenen Validierungsregeln, was die Wartbarkeit verbessert und die Entwicklung neuer Funktionen beschleunigt.</p>
<p>Der methoden-basierte Ansatz reduziert die KI-Abhängigkeit bei der Ausführung von Operationen, während die KI weiterhin für die Planung und Koordination der Methoden eingesetzt wird. Diese Trennung von Planung und Ausführung führt zu zuverlässigeren Ergebnissen und besserer Nachvollziehbarkeit.</p>
<p>Die Architektur ermöglicht eine präzise Dokumentation und Validierung jeder Operation, was in einer regulierten Umgebung von besonderem Wert ist. Die klare Struktur erleichtert die Integration neuer Dienste und die Erweiterung bestehender Funktionalitäten.</p>
<h2>Praktisches Beispiel: Dokumentenverarbeitung und E-Mail-Versand</h2>
<div class="example">
<div class="old-arch">
<strong>Alte Agenten-basierte Architektur:</strong><br>
<pre>
Benutzer: "Suche nach Verträgen im SharePoint und sende mir eine Zusammenfassung per E-Mail"
Agent SharePoint:
- Sucht nach Verträgen
- Extrahiert Inhalte
- Speichert Ergebnisse
Agent Outlook:
- Liest Ergebnisse
- Erstellt E-Mail
- Sendet E-Mail</pre>
</div>
<div class="new-arch">
<strong>Neue Methoden-basierte Architektur:</strong><br>
<pre>
Benutzer: "Suche nach Verträgen im SharePoint und sende mir eine Zusammenfassung per E-Mail"
Methoden-Katalog:
1. SharePoint.searchDocuments
- Parameter: {query: "Verträge", site: "valueon"}
- Retry: 3x bei Netzwerkfehler
- Auth: MSFT
2. Document.extractContent
- Parameter: {documents: [...], sections: ["Zusammenfassung"]}
- Retry: 2x bei Extraktionsfehler
- Auth: LOCAL
3. Outlook.sendMail
- Parameter: {to: ["user@example.com"], subject: "Vertragszusammenfassung"}
- Retry: 1x bei SMTP-Fehler
- Auth: MSFT</pre>
</div>
<div class="benefits">
<strong>Vorteile im Beispiel:</strong>
<ul>
<li>Jede Operation ist klar definiert und validierbar</li>
<li>Retry-Logik ist spezifisch für jede Operation</li>
<li>Authentifizierung ist explizit definiert</li>
<li>Fehler können präzise zugeordnet werden</li>
<li>Operationen können unabhängig voneinander getestet werden</li>
<li>Neue Operationen können einfach hinzugefügt werden</li>
</ul>
</div>
</div>
<p>Die KI plant die Ausführung dieser Methoden, aber die eigentliche Ausführung erfolgt durch die definierten Methoden mit klaren Parametern und Ergebnissen. Dies führt zu einer zuverlässigeren und besser nachvollziehbaren Ausführung.</p>
</body>
</html>

View file

@ -1,999 +0,0 @@
# Chat System Process Flow Specification
## 1. System Overview
### 1.1 Core Components
- **WorkflowManager**: Orchestrates the overall workflow process
- **ChatManager**: Manages chat interactions and task execution
- **ServiceCenter**: Central state and context management
- **AgentTask**: Core data object for task execution
### 1.2 Service center Structure
```python
from enum import Enum
from typing import Dict, List, Optional, Any, Literal
from datetime import datetime, UTC
from pydantic import BaseModel, Field
class TaskStatus(str, Enum):
PENDING = "pending"
SUCCESS = "success"
FAILED = "failed"
RETRY = "retry"
TIMEOUT = "timeout"
ROLLBACK = "rollback"
class ActionStatus(str, Enum):
PENDING = "pending"
SUCCESS = "success"
FAILED = "failed"
RETRY = "retry"
TIMEOUT = "timeout"
SKIPPED = "skipped"
DEPENDENCY_FAILED = "dependency_failed"
class AuthSource(str, Enum):
LOCAL = "local"
MSFT = "msft"
GOOGLE = "google"
# Add more auth sources as needed
class MethodParameter(BaseModel):
"""Model for method parameters"""
name: str
type: str
required: bool
validation: Optional[callable] = None
description: str
class ActionResult(BaseModel):
"""Model for method results"""
success: bool
data: Dict[str, Any]
metadata: Dict[str, Any]
validation: List[str]
class MethodBase:
"""Base class for all methods"""
def __init__(self, service):
self.service = service
self.name: str
self.description: str
self.auth_source: AuthSource = AuthSource.LOCAL # Default to local auth
@property
def actions(self) -> Dict[str, Dict[str, Any]]:
"""Available actions and their parameters"""
raise NotImplementedError
async def execute(self, action: str, parameters: Dict[str, Any], auth_data: Optional[Dict[str, Any]] = None) -> ActionResult:
"""Execute method action with authentication data"""
raise NotImplementedError
async def validate_parameters(self, action: str, parameters: Dict[str, Any]) -> bool:
"""Validate action parameters"""
if action not in self.actions:
return False
action_def = self.actions[action]
required_params = {k for k, v in action_def['parameters'].items() if v['required']}
return all(param in parameters for param in required_params)
async def rollback(self, action: str, parameters: Dict[str, Any], auth_data: Optional[Dict[str, Any]] = None) -> None:
"""Rollback action if needed"""
pass
class Action(BaseModel):
"""Action model with validation"""
method: str
action: str
parameters: Dict[str, Any]
retryCount: int = 0
retryMax: int
status: ActionStatus = ActionStatus.PENDING
timeout: Optional[int] = None
dependencies: List[str] = []
rollback_on_failure: bool = False
auth_source: Optional[AuthSource] = None # Auth source for this action
class Config:
use_enum_values = True
class AgentTask(BaseModel):
"""Task model with validation"""
id: str
workflowId: str
status: TaskStatus = TaskStatus.PENDING
userInput: str
dataList: List[Dict[str, str]] # List of available connections
actionList: List[Action]
chatHistory: str
taskHistory: str
previousTaskFeedback: Optional[str]
thisTaskFeedback: Optional[str]
result: Optional[Dict[str, Any]]
documentsInput: List[Dict]
documentsOutput: List[Dict]
startedAt: str
finishedAt: Optional[str]
error: Optional[str]
dependencies: List[str] = []
requiredOutputs: List[str] = []
class Config:
use_enum_values = True
def get_auth_data(self, auth_source: AuthSource) -> Optional[Dict[str, Any]]:
"""Get authentication data for the specified source"""
return next(
(conn for conn in self.dataList if conn.get('source') == auth_source),
None
)
def get_action_by_id(self, action_id: str) -> Optional[Action]:
"""Get action by its ID (method:action)"""
return next((a for a in self.actionList if f"{a.method}:{a.action}" == action_id), None)
def can_execute_action(self, action: Action) -> bool:
"""Check if action can be executed based on dependencies and auth"""
# Check dependencies
if action.dependencies:
if not all(
self.get_action_by_id(dep).status == ActionStatus.SUCCESS
for dep in action.dependencies
):
return False
# Check authentication
if action.auth_source and action.auth_source != AuthSource.LOCAL:
if not self.get_auth_data(action.auth_source):
return False
return True
def is_complete(self) -> bool:
"""Check if all actions are complete"""
return all(a.status in [ActionStatus.SUCCESS, ActionStatus.SKIPPED]
for a in self.actionList)
def has_failed(self) -> bool:
"""Check if any action has failed"""
return any(a.status == ActionStatus.FAILED for a in self.actionList)
class ServiceCenter:
"""Service center with improved state management"""
def __init__(self):
self.state = {
'status': TaskStatus.PENDING,
'retryCount': 0,
'retryMax': 3,
'timeout': 300, # 5 minutes
'lastError': None,
'lastErrorTime': None
}
self.methods: Dict[str, MethodBase] = {}
self.tasks: Dict[str, AgentTask] = {}
self.promptManager = AIPromptManager()
self.taskStateManager = TaskStateManager()
self.documentProcessor = DocumentExtraction()
async def execute_task(self, task: AgentTask) -> None:
"""Execute task with improved error handling and timeout"""
try:
# Check for timeout
if (datetime.now(UTC) - datetime.fromisoformat(task.startedAt)).seconds > self.state['timeout']:
task.status = TaskStatus.TIMEOUT
return
# Execute actions
for action in task.actionList:
if not task.can_execute_action(action):
if not task.get_auth_data(action.auth_source):
action.status = ActionStatus.FAILED
task.error = f"Missing authentication for {action.auth_source}"
else:
action.status = ActionStatus.DEPENDENCY_FAILED
continue
try:
# Get method
method = self.methods.get(action.method)
if not method:
raise ValueError(f"Unknown method: {action.method}")
# Validate parameters
if not await method.validate_parameters(action.action, action.parameters):
raise ValueError(f"Invalid parameters for {action.method}:{action.action}")
# Get auth data if needed
auth_data = None
if action.auth_source and action.auth_source != AuthSource.LOCAL:
auth_data = task.get_auth_data(action.auth_source)
if not auth_data:
raise ValueError(f"Missing authentication data for {action.auth_source}")
# Execute with timeout
result = await asyncio.wait_for(
method.execute(action.action, action.parameters, auth_data),
timeout=action.timeout or 60
)
if result.success:
action.status = ActionStatus.SUCCESS
else:
if self._should_retry(result.data.get('error')):
action.retryCount += 1
if action.retryCount > action.retryMax:
action.status = ActionStatus.FAILED
if action.rollback_on_failure:
await method.rollback(action.action, action.parameters, auth_data)
else:
action.status = ActionStatus.RETRY
else:
action.status = ActionStatus.FAILED
if action.rollback_on_failure:
await method.rollback(action.action, action.parameters, auth_data)
except asyncio.TimeoutError:
action.status = ActionStatus.TIMEOUT
except Exception as e:
action.status = ActionStatus.FAILED
if action.rollback_on_failure:
await method.rollback(action.action, action.parameters, auth_data)
# Update task status
if task.has_failed():
task.status = TaskStatus.FAILED
elif task.is_complete():
task.status = TaskStatus.SUCCESS
task.finishedAt = datetime.now(UTC).isoformat()
except Exception as e:
task.status = TaskStatus.FAILED
task.error = str(e)
class AIPromptManager:
"""Manages AI prompts and response validation"""
def generatePrompt(self, context: Dict[str, Any], examples: List[Dict]) -> str:
"""Generate a context-aware prompt with few-shot examples"""
prompt = (
f"Task: {context['task']}\n"
f"Document: {context['document']['name']} ({context['document']['type']})\n"
"Examples:\n"
)
for ex in examples:
prompt += f"- {ex['input']} => {ex['output']}\n"
prompt += "Extract the most relevant information for the task above."
return prompt
def validateResponse(self, response: str, schema: Dict) -> bool:
"""Validate AI response against a schema"""
import jsonschema
try:
jsonschema.validate(instance=response, schema=schema)
return True
except jsonschema.ValidationError:
return False
class TaskStateManager:
"""Manages task state and retry tracking"""
def __init__(self):
self.taskStates = {}
def trackState(self, task: AgentTask):
"""Track task state"""
self.taskStates[task.id] = {
"status": task.status,
"retryState": getattr(task, "retryState", {}),
"history": getattr(task, "history", [])
}
def canRetry(self, task: AgentTask, method: str) -> bool:
"""Check if task can be retried"""
retryState = self.taskStates[task.id].get("retryState", {})
return retryState.get(method, 0) < getattr(task, "retryMax", 3)
class DocumentContext(BaseModel):
"""Model for document context"""
id: str
extractionHistory: List[Dict]
relevantSections: List[str]
processingStatus: Dict[str, str]
class DocumentExtraction:
"""Processes documents with context awareness"""
def process_with_context(self, doc: Dict, context: DocumentContext) -> Dict:
"""Process document with context"""
extracted = {}
for section in context.relevantSections:
extracted[section] = doc.get(section)
return extracted
def track_extraction(self, doc: Dict, extraction: Dict):
"""Track document extraction"""
if 'extractionHistory' not in doc:
doc['extractionHistory'] = []
doc['extractionHistory'].append(extraction)
class ErrorRecovery(BaseModel):
"""Model for error recovery strategies"""
strategy: str # e.g., "retry", "fallback", "skip"
fallbackActions: List[str]
contextPreservation: bool
### 1.3 Method-Based Module Structure
```python
# Example: methodSharepoint.py
class MethodSharepoint:
"""SharePoint method implementation"""
def __init__(self, service):
self.service = service
self.name = "sharepoint"
self.description = "Search and process SharePoint documents"
self.auth_source = AuthSource.MSFT # Requires Microsoft authentication
@property
def actions(self) -> Dict[str, Dict[str, Any]]:
"""Available actions and their parameters"""
return {
"search": {
"description": "Search SharePoint documents",
"retryMax": 3,
"timeout": 30,
"parameters": {
"query": {"type": "string", "required": True},
"site": {"type": "string", "required": False},
"folder": {"type": "string", "required": False},
"maxResults": {"type": "number", "required": False}
}
}
}
async def execute(self, action: str, parameters: Dict[str, Any], auth_data: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
"""Execute SharePoint method"""
if not auth_data:
return {"success": False, "error": "Missing Microsoft authentication"}
if action == "search":
return await self._searchDocuments(parameters, auth_data)
return {"success": False, "error": f"Unknown action: {action}"}
async def _searchDocuments(self, parameters: Dict[str, Any], auth_data: Dict[str, Any]) -> Dict[str, Any]:
"""Search SharePoint documents"""
# Implementation using existing SharePoint agent functionality
pass
# Example: methodOutlook.py
class MethodOutlook:
"""Outlook method implementation"""
def __init__(self, service):
self.service = service
self.name = "outlook"
self.description = "Handle Outlook email operations"
@property
def actions(self) -> Dict[str, Dict[str, Any]]:
"""Available actions and their parameters"""
return {
"readMails": {
"description": "Read emails from specified folder",
"retryMax": 2, # Action-specific retry limit
"parameters": {
"folder": {"type": "string", "required": False},
"unreadOnly": {"type": "boolean", "required": False},
"fromAddress": {"type": "string", "required": False},
"maxResults": {"type": "number", "required": False}
}
},
"sendMail": {
"description": "Send an email",
"retryMax": 1, # Action-specific retry limit
"parameters": {
"to": {"type": "array", "items": "string", "required": True},
"subject": {"type": "string", "required": True},
"body": {"type": "string", "required": True},
"attachments": {"type": "array", "items": "FileRef", "required": False}
}
}
}
async def execute(self, action: str, parameters: Dict[str, Any]) -> Dict[str, Any]:
"""Execute Outlook method"""
if action == "readMails":
return await self._readMails(parameters)
elif action == "sendMail":
return await self._sendMail(parameters)
return {"success": False, "error": f"Unknown action: {action}"}
```
### 1.4 Key Data Objects
```python
class ChatWorkflow:
id: str
mandateId: str
status: str
name: Optional[str]
currentRound: int
lastActivity: str
startedAt: str
logs: List[ChatLog]
messages: List[ChatMessage]
stats: Optional[ChatStat]
tasks: List[Task]
class AgentTask:
id: str
workflowId: str
status: str # pending, success, failed, retry
userInput: str # AI-processed summary
dataList: List[Dict[str, str]] # User connections
actionList: List[Dict[str, Any]] # Actions to execute, e.g.:
# [
# {
# "method": "sharepoint",
# "action": "search",
# "parameters": {
# "query": "offerings",
# "site": "valueon"
# },
# "retryCount": 0,
# "retryMax": 3,
# "status": "pending" # pending, success, failed, retry
# },
# {
# "method": "outlook",
# "action": "sendMail",
# "parameters": {
# "to": ["user@example.com"],
# "subject": "Offer Summary",
# "body": "..."
# },
# "retryCount": 0,
# "retryMax": 1,
# "status": "pending"
# }
# ]
chatHistory: str # Summary of previous messages
taskHistory: str # Summary of previous tasks
previousTaskFeedback: Optional[str]
thisTaskFeedback: Optional[str]
result: Optional[ChatMessage]
documentsInput: List[Dict]
documentsOutput: List[Dict]
startedAt: str
finishedAt: Optional[str]
error: Optional[str]
dependencies: List[str] = [] # Task dependencies
requiredOutputs: List[str] = [] # Required outputs from dependencies
```
## 2. Process Flow
### 2.1 Initialization Phase
```mermaid
graph TD
A[User Input] --> B[WorkflowManager.workflowProcess]
B --> C[ChatManager.initialize]
C --> D[Create ServiceCenter]
D --> E[Create Initial Task]
```
1. **WorkflowManager.workflowProcess**
- Receives user input and workflow
- Initializes chat manager
- Starts task processing loop
2. **ChatManager.initialize**
- Creates ServiceCenter with all required components
- Initializes service interfaces
- Sets up task and state management
### 2.2 Task Creation Phase
1. **Create Initial Task**
```python
def createInitialTask(self, userInput: UserInputRequest) -> AgentTask:
# 1. Get available methods and their actions
available_methods = self._getAvailableMethods()
method_catalog = {
method.name: {
"description": method.description,
"actions": method.actions
}
for method in available_methods
}
# 2. Process user input with AI including document analysis
processedInput = await self.service.model['callAiBasic'](
f"""Analyze user request and documents:
User Prompt: {userInput.prompt}
Documents: {userInput.listFileId}
Available Methods:
{json.dumps(method_catalog, indent=2)}
Please provide:
1. Main objective
2. Required actions (using available methods and their actions)
3. Required data sources
4. Document processing requirements
5. Expected output format
Format your response as JSON:
{{
"objective": "string",
"actions": [
{{
"method": "string",
"action": "string",
"parameters": {{
"param1": "value1",
"param2": "value2"
}}
}}
],
"dataSources": ["string"],
"documentRequirements": ["string"],
"outputFormat": "string"
}}
"""
)
# 3. Create task with processed input and initialize action states
actions = []
for action in processedInput['actions']:
method = next(m for m in available_methods if m.name == action['method'])
action_info = method.actions[action['action']]
actions.append({
**action,
"retryCount": 0,
"retryMax": action_info['retryMax'],
"status": "pending"
})
task = AgentTask(
workflowId=self.service.workflow.id,
userInput=processedInput,
dataList=self.service.context['dataConnections'],
actionList=actions,
chatHistory=await self.workflowSummarize(userInput),
startedAt=datetime.now(UTC).isoformat()
)
# 4. Store in service
self.service.tasks['current'] = task
return task
```
### 2.3 Task Execution Phase
1. **Execute Task**
```python
async def executeTask(self, task: AgentTask) -> None:
"""Execute task actions in sequence"""
for action in task.actionList:
if action['status'] == 'pending':
try:
# Get method instance
method = self.service.methods[action['method']]
# Execute action
result = await method.execute(
action['action'],
action['parameters']
)
if result['success']:
action['status'] = 'success'
else:
if self._shouldRetry(result['error']):
action['retryCount'] += 1
if action['retryCount'] > action['retryMax']:
action['status'] = 'failed'
task.status = 'failed'
task.error = "Maximum retries exceeded"
else:
action['status'] = 'retry'
task.status = 'retry'
else:
action['status'] = 'failed'
task.status = 'failed'
task.error = result['error']
except Exception as e:
action['status'] = 'failed'
task.status = 'failed'
task.error = str(e)
# Update task status based on action status
if action['status'] == 'failed':
break
# Mark task as complete if all actions succeeded
if all(a['status'] == 'success' for a in task.actionList):
task.status = 'success'
task.finishedAt = datetime.now(UTC).isoformat()
```
### 2.4 Task Analysis Phase
1. **Define Next Task**
```python
def defineNextTask(self, currentTask: AgentTask) -> Optional[AgentTask]:
try:
# 1. Analyze current task results using basic AI
analysis = await self.service.model['callAiBasic'](
f"""Analyze task results and determine next steps:
Previous Feedback: {currentTask.previousTaskFeedback}
Current Feedback: {currentTask.thisTaskFeedback}
User Input: {currentTask.userInput}
Current Documents: {currentTask.documentsOutput}
Please provide:
1. Task completion status
2. Next required actions
3. Required documents
4. Method recommendations
Format your response as JSON:
{{
"isComplete": boolean,
"nextActions": ["string"],
"requiredDocuments": ["string"],
"recommendedMethods": ["string"]
}}
"""
)
# 2. Parse and validate AI response
analysis_data = json.loads(analysis)
# 3. Determine if next task needed
if not analysis_data["isComplete"]:
# 4. Create next task
nextTask = self._createNextTask(currentTask, analysis_data)
self.service.tasks['previous'] = currentTask
self.service.tasks['current'] = nextTask
return nextTask
return None
except Exception as e:
logger.error(f"Error defining next task: {str(e)}")
return None
```
## 3. Method Integration
### 3.1 Method Registration
```python
def _registerMethods(self):
"""Register available methods in service center"""
self.service.methods = {
"sharepoint": MethodSharepoint(self.service),
"outlook": MethodOutlook(self.service),
"web": MethodWeb(self.service),
"document": MethodDocument(self.service)
}
```
### 3.2 Method Execution
```python
def _executeMethod(self, method: str, parameters: Dict[str, Any]) -> Dict[str, Any]:
"""Execute a method with parameters"""
try:
# Get method implementation
method_impl = self.service.methods.get(method)
if not method_impl:
return {"success": False, "error": f"Unknown method: {method}"}
# Execute method
return await method_impl.execute(parameters)
except Exception as e:
return {"success": False, "error": str(e)}
```
## 4. Error Handling
### 4.1 Error Types
1. **AI Errors**
- Model unavailable
- Invalid response
- Timeout
2. **Method Errors**
- Invalid method
- Execution failure
- Resource unavailable
3. **Task Errors**
- Invalid state
- Missing data
- Timeout
### 4.2 Retry Logic
```python
def _shouldRetry(self, error: str) -> bool:
"""Determine if error is retryable"""
retryable_errors = [
"AI down",
"Document not found",
"Content extraction failed"
]
return any(err in error for err in retryable_errors)
def _shouldCreateNextTask(self, analysis: Dict[str, Any]) -> bool:
"""Determine if next task is needed based on AI analysis"""
return not analysis.get("isComplete", True)
```
## 5. AI Integration Points
### 5.1 User Input Processing
```python
async def _processUserInput(self, input: str, documents: List[str]) -> str:
"""Process user input including document analysis"""
context = {
"task": "Process user input",
"document": {"name": "User Input", "type": "text"}
}
examples = [
{"input": "Search documents", "output": "Extract relevant information"}
]
prompt = self.service.promptManager.generatePrompt(context, examples)
return await self.service.model['callAiBasic'](
f"""Analyze user request and documents:
User Input: {input}
Documents: {documents}
{prompt}
Please provide:
1. Main objective
2. Required actions
3. Required data sources
4. Document processing requirements
5. Expected output format
Format your response as JSON:
{{
"objective": "string",
"actions": ["string"],
"dataSources": ["string"],
"documentRequirements": ["string"],
"outputFormat": "string"
}}
"""
)
```
### 5.2 Task Analysis
```python
async def _analyzeTaskResults(self, task: AgentTask) -> str:
"""Analyze task results and determine next steps"""
context = {
"task": "Analyze task results",
"document": {"name": "Task Results", "type": "json"}
}
examples = [
{"input": "Task completed", "output": "Generate next steps"}
]
prompt = self.service.promptManager.generatePrompt(context, examples)
return await self.service.model['callAiBasic'](
f"""Analyze task results and determine next steps:
Task Input: {task.userInput}
Previous Feedback: {task.previousTaskFeedback}
Current Feedback: {task.thisTaskFeedback}
Current Documents: {task.documentsOutput}
{prompt}
Please provide:
1. Task completion status
2. Next required actions
3. Required documents
4. Method recommendations
Format your response as JSON:
{{
"isComplete": boolean,
"nextActions": ["string"],
"requiredDocuments": ["string"],
"recommendedMethods": ["string"]
}}
"""
)
```
### 5.3 Result Processing
```python
async def _processTaskResults(self, task: AgentTask) -> str:
"""Process task results and generate feedback"""
context = {
"task": "Process task results",
"document": {"name": "Task Results", "type": "json"}
}
examples = [
{"input": "Task results", "output": "Generate summary"}
]
prompt = self.service.promptManager.generatePrompt(context, examples)
return await self.service.model['callAiBasic'](
f"""Process task results and generate feedback:
Task Input: {task.userInput}
Method Results: {task.result}
Generated Documents: {task.documentsOutput}
{prompt}
Please provide:
1. Summary of completed actions
2. Generated document descriptions
3. Next steps or completion status
Format your response as JSON:
{{
"summary": "string",
"documents": ["string"],
"nextSteps": ["string"]
}}
"""
)
```
## 6. File Structure and Implementation Plan
### 6.1 File Structure
```
gateway/
├── modules/
│ ├── workflow/
│ │ ├── managerWorkflow.py # Workflow management and state machine
│ │ ├── managerChat.py # Chat management and AI response validation
│ │ ├── managerPrompt.py # AI prompt generation and management
│ │ ├── methodBase.py # Base method class with result validation
│ │ └── documentExtraction.py # Document content extraction
│ │
│ ├── agents/ # To be refactored into methods
│ │ ├── agentSharepoint.py → methods/methodSharepoint.py
│ │ ├── agentOutlook.py → methods/methodOutlook.py
│ │ ├── agentWebcrawler.py → methods/methodWeb.py
│ │ ├── agentDocument.py → methods/methodDocument.py
│ │ └── agentCoder.py → methods/methodCoder.py
│ │
│ ├── methods/ # New directory for method implementations
│ │ ├── methodSharepoint.py # SharePoint operations
│ │ ├── methodOutlook.py # Outlook operations
│ │ ├── methodWeb.py # Web operations
│ │ ├── methodDocument.py # Document operations
│ │ ├── methodCoder.py # Code generation operations
│ │ └── methodPowerpoint.py # PowerPoint operations
│ │
│ └── interfaces/
│ ├── interfaceChatModel.py # Chat system models and enums
│ └── interfaceAppModel.py # Application models including UserConnection
```
### 6.2 Implementation Plan
#### Phase 1: Core Structure Setup
1. **File Renaming and Organization**
- Rename manager files to follow `manager*.py` pattern
- Move document processor to `documentExtraction.py`
- Create new `methods` directory
2. **Model Updates**
- Update `interfaceChatModel.py` with new enums and models
- Integrate `UserConnection` from `interfaceAppModel.py`
- Update validation logic in respective modules
#### Phase 2: Method Migration
1. **Base Method Implementation**
- Implement `methodBase.py` with core functionality
- Add method result validation
- Set up authentication handling
2. **Agent to Method Conversion**
- Convert each agent to its method implementation
- Migrate functionality while maintaining existing behavior
- Add method-specific validation
3. **New Method Implementation**
- Implement `methodPowerpoint.py`
- Add PowerPoint-specific operations
- Integrate with document processing
#### Phase 3: Manager Updates
1. **Chat Manager Enhancement**
- Integrate AI response validation
- Update service center structure
- Improve error handling
2. **Document Manager Integration**
- Update document operations for new method structure
- Enhance content extraction capabilities
- Improve file handling
3. **Workflow Manager Updates**
- Update state machine for method-based approach
- Improve task management
- Enhance error recovery
#### Phase 4: Testing and Validation
1. **Unit Testing**
- Test each method implementation
- Validate error handling
- Verify authentication flow
2. **Integration Testing**
- Test method interactions
- Validate document processing
- Verify workflow execution
3. **Performance Testing**
- Measure response times
- Validate resource usage
- Test concurrent operations
#### Phase 5: Documentation and Cleanup
1. **Documentation**
- Update API documentation
- Document method implementations
- Add usage examples
2. **Code Cleanup**
- Remove deprecated code
- Clean up old agent files
- Optimize imports
3. **Final Review**
- Code review
- Security audit
- Performance optimization
### 6.3 Migration Strategy
1. **Incremental Migration**
- Migrate one agent at a time
- Maintain backward compatibility
- Use feature flags for gradual rollout
2. **Testing Strategy**
- Unit tests for each method
- Integration tests for workflows
- End-to-end tests for complete scenarios
3. **Rollback Plan**
- Keep old agent implementations until stable
- Maintain version control
- Document rollback procedures
### 6.4 Success Criteria
1. **Functionality**
- All existing features working
- New method-based structure operational
- Improved error handling
2. **Performance**
- Equal or better response times
- Reduced resource usage
- Improved scalability
3. **Maintainability**
- Clear code structure
- Comprehensive documentation
- Easy to extend
4. **Security**
- Proper authentication handling
- Secure data processing
- Access control implementation

View file

@ -1,235 +0,0 @@
# AI Call Functions Test and Content Size Analysis
## Overview
This file documents the ServiceCenter AI functions that have risk of delivering too big content,
along with their usage patterns and potential size issues.
## High-Risk AI Functions
### 1. summarizeChat() -> callAiTextBasic()
**Location**: gateway/modules/chat/handling/promptFactory.py:122
**Risk Level**: MEDIUM
**Content**: Entire workflow message history
**Usage**:
```python
messageSummary = await service.summarizeChat(context.workflow.messages) if context.workflow else ""
```
**Potential Issues**:
- Long conversations can generate very large summaries
- Includes all previous messages in workflow
- No size limits or truncation
### 2. callAiTextAdvanced() -> interfaceAiCalls.callAiTextAdvanced()
**Risk Level**: HIGH
**Multiple Usage Points**:
#### A. Task Planning (handlingTasks.py:116)
```python
prompt = await self.service.callAiTextAdvanced(task_planning_prompt)
```
**Content**: User input + document context + connection context + previous results
**Risk**: VERY HIGH - includes all available documents and context
#### B. Action Definition (handlingTasks.py:388)
```python
prompt = await self.service.callAiTextAdvanced(action_prompt)
```
**Content**: Task context + available documents + connections + previous results
**Risk**: HIGH - comprehensive context for action planning
#### C. Result Review (handlingTasks.py:894)
```python
response = await self.service.callAiTextAdvanced(prompt)
```
**Content**: Action results + success criteria + context
**Risk**: MEDIUM-HIGH - depends on result size
#### D. Email Composition (methodOutlook.py:1609)
```python
composed_email = await self.service.interfaceAiCalls.callAiTextAdvanced(ai_prompt)
```
**Content**: Document content + email requirements
**Risk**: MEDIUM - depends on document size
#### E. AI Processing (methodAi.py:175)
```python
result = await self.service.callAiTextAdvanced(enhanced_prompt, context)
```
**Content**: User prompt + extracted document content
**Risk**: HIGH - includes full document content
### 3. callAiTextBasic() -> interfaceAiCalls.callAiTextBasic()
**Risk Level**: MEDIUM
**Multiple Usage Points**:
#### A. Document Format Conversion (methodDocument.py:429)
```python
formatted_content = await self.service.callAiTextBasic(ai_prompt, content)
```
**Content**: Document content + format requirements
**Risk**: MEDIUM - depends on document size
#### B. HTML Report Generation (methodDocument.py:642)
```python
aiReport = await self.service.callAiTextBasic(aiPrompt, combinedContent)
```
**Content**: Combined content from multiple documents
**Risk**: HIGH - combines multiple documents
#### C. AI Processing Fallback (methodAi.py:177)
```python
result = await self.service.callAiTextBasic(enhanced_prompt, context)
```
**Content**: User prompt + document context
**Risk**: MEDIUM - includes document content
#### D. Document Content Processing (documentExtraction.py:1459)
```python
processedContent = await self._serviceCenter.callAiTextBasic(aiPrompt, contentToProcess)
```
**Content**: Document chunks + AI prompt
**Risk**: MEDIUM - processes document chunks
### 4. extractContentFromDocument() -> documentProcessor.processFileData()
**Risk Level**: HIGH
**Multiple Usage Points**:
#### A. Document Content Extraction (methodDocument.py:74)
```python
extracted_content = await self.service.extractContentFromDocument(
prompt=aiPrompt,
document=chatDocument
)
```
**Content**: Full document + extraction prompt
**Risk**: HIGH - processes entire documents
#### B. HTML Report Generation (methodDocument.py:581)
```python
extracted_content = await self.service.extractContentFromDocument(
prompt="Extract readable text content for HTML report generation",
document=doc
)
```
**Content**: Full document content
**Risk**: HIGH - processes documents for reports
#### C. Email Composition (methodOutlook.py:1510)
```python
extracted_content = await self.service.extractContentFromDocument(
prompt="Extract readable text content for email composition",
document=doc
)
```
**Content**: Full document content
**Risk**: HIGH - processes documents for emails
#### D. AI Processing (methodAi.py:94)
```python
extracted_content = await self.service.extractContentFromDocument(
prompt=extraction_prompt.strip(),
document=doc
)
```
**Content**: Full document content
**Risk**: HIGH - processes documents for AI analysis
## Risk Assessment Summary
### CRITICAL RISK (Immediate Attention Required)
1. **Task Planning** (handlingTasks.py:116) - Entire workflow context
2. **Action Definition** (handlingTasks.py:388) - Comprehensive context
3. **Document Processing** (all extractContentFromDocument calls) - Full documents
4. **AI Method Processing** (methodAi.py:175) - Document content + context
5. **Report Generation** (methodDocument.py:642) - Multiple documents combined
### HIGH RISK (Monitor Closely)
1. **Chat Summarization** (promptFactory.py:122) - Message history
2. **Document Format Conversion** (methodDocument.py:429) - Single documents
3. **Email Composition** (methodOutlook.py:1609) - Document content
## Potential Issues
### Content Size Problems
- Large documents (PDFs, Word docs, Excel files) can exceed AI model limits
- Combined document content in reports can be massive
- Long conversation histories in chat summarization
- Full workflow context in task planning
### Performance Issues
- Timeout errors for large content
- Memory issues with large document processing
- API rate limiting with large requests
- Cost implications for large AI calls
### Error Scenarios
- OpenAI API 400 errors (content too large)
- Timeout errors (processing too slow)
- Memory exhaustion (large document processing)
- Incomplete processing (truncated content)
## Recommended Solutions
### 1. Content Size Limits
- Implement maximum content size checks before AI calls
- Truncate large content with appropriate warnings
- Split large documents into chunks
### 2. Content Filtering
- Remove unnecessary context from prompts
- Filter out large binary content
- Use document summaries instead of full content
### 3. Chunking Strategy
- Process large documents in smaller chunks
- Implement progressive processing
- Use streaming for large responses
### 4. Caching and Optimization
- Cache processed document content
- Reuse extracted content across operations
- Implement smart content selection
### 5. Error Handling
- Graceful degradation for oversized content
- Fallback strategies for failed AI calls
- User notifications for content size issues
## Test Scenarios
### Test Case 1: Large Document Processing
- Upload a 10MB PDF document
- Try to extract content for AI processing
- Monitor for size limit errors
### Test Case 2: Multiple Document Reports
- Upload 5+ large documents
- Generate HTML report
- Check for combined content size issues
### Test Case 3: Long Conversation History
- Create workflow with 50+ messages
- Test chat summarization
- Monitor for context size limits
### Test Case 4: Task Planning with Large Context
- Create workflow with many documents
- Test task planning functionality
- Check for prompt size limits
## Monitoring Recommendations
1. **Log Content Sizes**: Track the size of content sent to AI functions
2. **Monitor API Errors**: Watch for 400 errors indicating content too large
3. **Performance Metrics**: Track processing times for large content
4. **User Feedback**: Monitor for incomplete or failed operations
5. **Cost Tracking**: Monitor AI API costs for large requests
## Implementation Priority
1. **Immediate**: Add content size checks to extractContentFromDocument
2. **High**: Implement chunking for large document processing
3. **Medium**: Add content filtering to task planning prompts
4. **Low**: Implement caching for processed content
This analysis should help identify and mitigate the risks of delivering too big content to AI functions.

View file

@ -1,103 +0,0 @@
#!/usr/bin/env python3
"""
Test script to verify AI fallback mechanism from Basic to Advanced when context length is exceeded.
"""
import asyncio
import logging
from modules.interfaces.interfaceAiCalls import AiCalls
from modules.connectors.connectorAiOpenai import ContextLengthExceededException
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
async def test_context_length_fallback():
"""Test the fallback mechanism when context length is exceeded"""
# Create AI calls instance
ai_calls = AiCalls()
# Create a very large context that would exceed OpenAI's context limit
large_context = "This is a test context. " * 10000 # Create a large context
prompt = "Please summarize this context in one sentence."
logger.info("Testing AI Basic with large context (should trigger fallback)...")
try:
# This should trigger the context length exceeded error and fallback to Advanced
result = await ai_calls.callAiTextBasic(prompt, large_context)
logger.info(f"✅ Fallback successful! Result: {result[:100]}...")
return True
except Exception as e:
logger.error(f"❌ Test failed: {str(e)}")
return False
async def test_direct_context_length_exception():
"""Test that the ContextLengthExceededException is properly raised"""
from modules.connectors.connectorAiOpenai import AiOpenai
logger.info("Testing direct ContextLengthExceededException...")
try:
# Create OpenAI connector
openai_connector = AiOpenai()
# Create messages that would exceed context length
large_messages = [
{"role": "user", "content": "Test message. " * 50000} # Very large message
]
# This should raise ContextLengthExceededException
await openai_connector.callAiBasic(large_messages)
logger.error("❌ Expected ContextLengthExceededException but none was raised")
return False
except ContextLengthExceededException as e:
logger.info(f"✅ ContextLengthExceededException properly raised: {str(e)}")
return True
except Exception as e:
logger.error(f"❌ Unexpected exception: {str(e)}")
return False
async def main():
"""Run all tests"""
logger.info("Starting AI fallback mechanism tests...")
tests = [
("Context Length Fallback", test_context_length_fallback),
("Direct Exception Test", test_direct_context_length_exception),
]
results = []
for test_name, test_func in tests:
logger.info(f"\n--- Running {test_name} ---")
try:
result = await test_func()
results.append((test_name, result))
except Exception as e:
logger.error(f"Test {test_name} crashed: {str(e)}")
results.append((test_name, False))
# Summary
logger.info("\n" + "="*50)
logger.info("TEST SUMMARY")
logger.info("="*50)
passed = 0
for test_name, result in results:
status = "✅ PASSED" if result else "❌ FAILED"
logger.info(f"{test_name}: {status}")
if result:
passed += 1
logger.info(f"\nTotal: {passed}/{len(results)} tests passed")
if passed == len(results):
logger.info("🎉 All tests passed! Fallback mechanism is working correctly.")
else:
logger.warning("⚠️ Some tests failed. Please check the implementation.")
if __name__ == "__main__":
asyncio.run(main())

View file

@ -1,855 +0,0 @@
#!/usr/bin/env python3
"""
Test script for DocumentExtraction class.
Processes all files in d:/temp folder and stores extracted content in d:/temp/extracted.
Features:
- Option to extract content WITH AI processing (default)
- Option to extract content WITHOUT AI processing (content-only mode)
- Supports all document types: text, images, PDFs, Office documents, etc.
- Detailed logging and progress tracking
- Separate output directories for AI vs content-only modes
Usage:
- Interactive mode: python test_documentExtraction.py
- Content-only mode: python test_documentExtraction.py --no-ai
- Content-only mode: python test_documentExtraction.py --content-only
- Specify custom input/output: python test_documentExtraction.py --input-dir /path/to/input --output-dir /path/to/output --no-ai
"""
import os
import asyncio
import logging
import sys
import argparse
from pathlib import Path
from typing import List, Optional
from datetime import datetime, UTC
# Configure logging
logging.basicConfig(
level=logging.DEBUG, # Changed from INFO to DEBUG
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Filter out specific unwanted log messages
class LogFilter(logging.Filter):
"""Filter to hide specific unwanted log messages."""
def filter(self, record):
# Hide workflow stats update errors
if "Workflow" in record.getMessage() and "not found for stats update" in record.getMessage():
return False
# Hide HTTP request info messages
if "HTTP Request:" in record.getMessage() and "POST https://api.openai.com" in record.getMessage():
return False
# Hide HTTP response info messages
if "HTTP/1.1 200 OK" in record.getMessage():
return False
return True
# Apply the filter to the root logger
root_logger = logging.getLogger()
root_logger.addFilter(LogFilter())
def check_dependencies():
"""Check if required dependencies are available and provide installation instructions."""
missing_deps = []
# Check for required dependencies
try:
import bs4
logger.info("beautifulsoup4 is available")
except ImportError:
missing_deps.append("beautifulsoup4")
logger.error("beautifulsoup4 is missing")
try:
import PyPDF2
logger.info("PyPDF2 is available")
except ImportError:
missing_deps.append("PyPDF2")
logger.error("PyPDF2 is missing")
try:
import fitz
logger.info("PyMuPDF (fitz) is available")
except ImportError:
missing_deps.append("PyMuPDF")
logger.error("PyMuPDF (fitz) is missing")
try:
import docx
logger.info("python-docx is available")
except ImportError:
missing_deps.append("python-docx")
logger.error("python-docx is missing")
try:
import openpyxl
logger.info("openpyxl is available")
except ImportError:
missing_deps.append("openpyxl")
logger.error("openpyxl is missing")
try:
import pptx
logger.info("python-pptx is available")
except ImportError:
missing_deps.append("python-pptx")
logger.error("python-pptx is missing")
try:
from PIL import Image
logger.info("Pillow (PIL) is available")
except ImportError:
missing_deps.append("Pillow")
logger.error("Pillow (PIL) is missing")
if missing_deps:
logger.error("\n" + "="*60)
logger.error("MISSING DEPENDENCIES DETECTED!")
logger.error("="*60)
logger.error("The following packages are required but not installed:")
for dep in missing_deps:
logger.error(f" - {dep}")
logger.error("\nTo install all dependencies, run:")
logger.error("pip install -r requirements.txt")
logger.error("\nOr install individual packages:")
for dep in missing_deps:
if dep == "beautifulsoup4":
logger.error(f" pip install {dep}")
elif dep == "PyMuPDF":
logger.error(f" pip install {dep}")
elif dep == "Pillow":
logger.error(f" pip install {dep}")
else:
logger.error(f" pip install {dep}")
logger.error("="*60)
return False
logger.info("All required dependencies are available!")
return True
def check_module_imports():
"""Check if we can import the required modules."""
try:
# Add the gateway directory to the path so we can import our modules
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', '..'))
from modules.chat.documents.documentExtraction import DocumentExtraction
from modules.chat.serviceCenter import ServiceCenter
from modules.interfaces.interfaceAppModel import User, UserConnection
from modules.interfaces.interfaceChatModel import ChatWorkflow, TaskItem
logger.info("All required modules imported successfully")
return True
except ImportError as e:
logger.error(f"Failed to import required modules: {e}")
logger.error("Make sure you're running this script from the gateway directory")
return False
except Exception as e:
logger.error(f"Unexpected error importing modules: {e}")
return False
def create_mock_service_center():
"""Create a proper ServiceCenter for testing purposes with all required fields."""
try:
from modules.chat.serviceCenter import ServiceCenter
from modules.interfaces.interfaceAppModel import User, UserPrivilege, AuthAuthority
from modules.interfaces.interfaceChatModel import ChatWorkflow, TaskItem, TaskStatus
from modules.interfaces.interfaceChatModel import ChatLog, ChatMessage, ChatStat
# Create proper user with all required fields
mock_user = User(
id="test_user_001",
username="testuser",
email="test@example.com",
fullName="Test User",
language="en",
enabled=True,
privilege=UserPrivilege.USER,
authenticationAuthority=AuthAuthority.LOCAL,
mandateId="test_mandate_001"
)
# Create proper workflow with all required fields
current_time = datetime.now(UTC).isoformat()
mock_workflow = ChatWorkflow(
id="test_workflow_001",
mandateId="test_mandate_001",
status="active",
name="Test Document Extraction Workflow",
currentRound=1,
lastActivity=current_time,
startedAt=current_time,
logs=[],
messages=[],
stats=None,
tasks=[]
)
# Create service center
service_center = ServiceCenter(mock_user, mock_workflow)
logger.info("ServiceCenter created successfully with proper objects")
return service_center
except Exception as e:
logger.error(f"Failed to create ServiceCenter: {e}")
return None
class DocumentExtractionTester:
"""Test class for DocumentExtraction functionality."""
def __init__(self, input_dir: str = "d:/temp/test-extraction", output_dir: str = None, enable_ai: bool = True):
"""
Initialize the tester.
Args:
input_dir: Directory containing files to process
output_dir: Directory to store extracted content (auto-generated if None)
enable_ai: Whether to enable AI processing (default: True)
"""
self.input_dir = Path(input_dir)
# Auto-generate output directory if not specified
if output_dir is None:
if enable_ai:
self.output_dir = Path(input_dir) / "extracted"
else:
self.output_dir = Path(input_dir) / "extracted-raw"
else:
self.output_dir = Path(output_dir)
self.extractor = None
self.service_center = None
self.enable_ai = enable_ai
if enable_ai:
self.prompt = "Make a summary of each sentence for each page or chapter of the document"
else:
self.prompt = None # No prompt needed for content-only extraction
# Track processing results for summary
self.processing_results = []
# Ensure output directory exists
logger.info(f"Creating output directory: {self.output_dir}")
self.output_dir.mkdir(parents=True, exist_ok=True)
# Verify directory was created
if self.output_dir.exists():
logger.info(f"Output directory created/verified: {self.output_dir}")
logger.info(f"Output directory absolute path: {self.output_dir.absolute()}")
else:
logger.error(f"Failed to create output directory: {self.output_dir}")
# Log configuration
logger.info(f"Configuration: AI processing = {'ENABLED' if self.enable_ai else 'DISABLED'}")
logger.info(f"Input directory: {self.input_dir}")
logger.info(f"Output directory: {self.output_dir}")
# Test basic file writing capability
test_file = self.output_dir / "test_write_capability.txt"
try:
logger.info(f"Testing file write capability to: {test_file}")
logger.info(f"Absolute path: {test_file.absolute()}")
with open(test_file, 'w', encoding='utf-8') as f:
f.write("Test file to verify write capability")
if test_file.exists():
actual_size = test_file.stat().st_size
logger.info(f"Basic file writing test passed: {test_file} (size: {actual_size} bytes)")
# Test reading the file back
with open(test_file, 'r', encoding='utf-8') as f:
content = f.read()
logger.info(f"File read test passed: content length = {len(content)}")
# Clean up test file
test_file.unlink()
logger.info("Test file cleaned up")
else:
logger.error(f"Basic file writing test failed: {test_file}")
except Exception as e:
logger.error(f"Basic file writing test failed with error: {e}")
import traceback
traceback.print_exc()
# Supported file extensions for content extraction
self.supported_extensions = {
# Text and data files
'.txt', '.csv', '.json', '.xml', '.html', '.htm', '.svg',
'.md', '.markdown', '.rst', '.log', '.ini', '.cfg', '.conf',
# Programming languages
'.js', '.ts', '.jsx', '.tsx', '.py', '.java', '.c', '.cpp', '.cc', '.cxx',
'.h', '.hpp', '.cs', '.php', '.rb', '.go', '.rs', '.swift', '.kt', '.scala',
'.r', '.m', '.pl', '.sh', '.bash', '.zsh', '.fish', '.ps1', '.bat', '.cmd',
'.vbs', '.lua', '.sql', '.r', '.dart', '.elm', '.clj', '.hs', '.fs', '.ml',
# Web technologies
'.css', '.scss', '.sass', '.less', '.vue', '.svelte', '.astro',
# Configuration and build files
'.yaml', '.yml', '.toml', '.env', '.gitignore', '.dockerfile', '.dockerignore',
'.makefile', '.cmake', '.gradle', '.maven', '.pom', '.sln', '.vcxproj',
'.csproj', '.fsproj', '.vbproj', '.xcodeproj', '.pbxproj',
# Documentation and markup
'.tex', '.bib', '.adoc', '.asciidoc', '.wiki', '.creole',
# Images
'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff', '.ico',
# Documents
'.pdf', '.docx', '.xlsx', '.pptx', '.odt', '.ods', '.odp',
# Legacy Office formats
'.doc', '.xls', '.ppt',
# Archives and binaries
'.zip', '.tar', '.gz', '.7z', '.rar', '.exe', '.dll', '.so', '.dylib'
}
def initialize_extractor(self):
"""Initialize the DocumentExtraction instance with a proper ServiceCenter."""
try:
# First create the service center
self.service_center = create_mock_service_center()
if not self.service_center:
logger.error("Failed to create ServiceCenter!")
return False
# Now create DocumentExtraction with the service center
from modules.chat.documents.documentExtraction import DocumentExtraction
self.extractor = DocumentExtraction(self.service_center)
logger.info("DocumentExtraction initialized successfully with ServiceCenter")
return True
except Exception as e:
logger.error(f"Failed to initialize DocumentExtraction: {e}")
return False
def get_files_to_process(self) -> List[Path]:
"""Get list of files to process from input directory."""
if not self.input_dir.exists():
logger.error(f"Input directory {self.input_dir} does not exist!")
logger.info("Creating input directory and adding a test file...")
self.input_dir.mkdir(parents=True, exist_ok=True)
# Create a test file if none exist
test_file = self.input_dir / "test.txt"
with open(test_file, 'w') as f:
f.write("This is a test file for document extraction.\nIt contains multiple lines.\nAnd some special characters: äöüß")
logger.info(f"Created test file: {test_file}")
files = []
all_files = list(self.input_dir.iterdir())
logger.info(f"All files in directory: {[f.name for f in all_files]}")
for file_path in all_files:
if file_path.is_file():
logger.debug(f"Checking file: {file_path.name} (extension: {file_path.suffix})")
if file_path.suffix.lower() in self.supported_extensions:
files.append(file_path)
logger.debug(f"Added file: {file_path.name}")
else:
logger.debug(f"Skipped file: {file_path.name} (unsupported extension)")
logger.info(f"Found {len(files)} supported files to process")
if files:
logger.info(f"Files to process: {[f.name for f in files]}")
return files
async def process_single_file(self, file_path: Path) -> bool:
"""
Process a single file and extract its content.
Args:
file_path: Path to the file to process
Returns:
True if successful, False otherwise
"""
if not self.extractor:
logger.error("DocumentExtraction not initialized!")
return False
try:
logger.info(f"Processing file: {file_path.name}")
# Read file data
with open(file_path, 'rb') as f:
file_data = f.read()
logger.debug(f"File size: {len(file_data)} bytes")
# Determine MIME type based on extension
mime_type = self._get_mime_type(file_path.suffix)
logger.debug(f"MIME type: {mime_type}")
# Process the file with or without AI based on configuration
extracted_content = await self.extractor.processFileData(
fileData=file_data,
fileName=file_path.name,
mimeType=mime_type,
base64Encoded=False,
prompt=self.prompt,
enableAI=self.enable_ai
)
logger.debug(f"Extracted {len(extracted_content.contents)} content items")
# Debug: Show content details
for i, content_item in enumerate(extracted_content.contents):
logger.debug(f"Content item {i+1}: label='{content_item.label}', has_data={content_item.data is not None}, data_length={len(content_item.data) if content_item.data else 0}")
# Special logging for JavaScript files
if mime_type == "application/javascript":
logger.debug(f"JavaScript file detected: {file_path.name}")
logger.debug(f"Original file size: {len(file_data)} bytes")
for i, content_item in enumerate(extracted_content.contents):
if content_item.data:
content_size = len(content_item.data.encode('utf-8'))
logger.debug(f"JavaScript content item {i+1}: {content_size} bytes")
# Check if content was truncated
if content_size < len(file_data) * 0.9: # If less than 90% of original
logger.warning(f"JavaScript content may be truncated: {content_size} bytes vs {len(file_data)} bytes original")
# Track processing result
result = {
'fileName': file_path.name,
'status': 'OK',
'content_items': 0,
'output_files': [],
'total_content_size': 0
}
# Save each content item as a separate file
if extracted_content.contents:
for i, content_item in enumerate(extracted_content.contents):
if content_item.data:
content_size = len(content_item.data.encode('utf-8'))
result['total_content_size'] += content_size
logger.debug(f"Content item {i+1}: {content_item.label}, size: {content_size} bytes")
# Generate fileName with new naming convention
if len(extracted_content.contents) == 1:
# Single content item
output_fileName = f"{file_path.stem} - {content_item.label} 1.txt"
else:
# Multiple content items - add sequence number
output_fileName = f"{file_path.stem} - {content_item.label} {i+1}.txt"
output_file = self.output_dir / output_fileName
# Write only the raw extracted content
logger.debug(f"Attempting to write to: {output_file}")
try:
with open(output_file, 'w', encoding='utf-8') as f:
f.write(content_item.data)
# Verify file was created
if output_file.exists():
actual_size = output_file.stat().st_size
logger.info(f"File created successfully: {output_fileName} (expected: {content_size} bytes, actual: {actual_size} bytes)")
else:
logger.error(f"File was not created: {output_file}")
result['output_files'].append(output_fileName)
result['content_items'] += 1
except Exception as write_error:
logger.error(f"Error writing file {output_fileName}: {write_error}")
import traceback
traceback.print_exc()
else:
logger.warning(f"Content item {i+1} has no data, skipping")
else:
logger.warning(f"No content extracted from {file_path.name}")
result['status'] = 'FAIL'
result['error'] = 'No content extracted'
# Add result to tracking list
self.processing_results.append(result)
logger.info(f"Successfully processed {file_path.name} - Total content: {result['total_content_size']} bytes")
return True
except Exception as e:
error_msg = str(e)
logger.error(f"Error processing {file_path.name}: {error_msg}")
# Track failed result
result = {
'fileName': file_path.name,
'status': 'FAIL',
'content_items': 0,
'output_files': [],
'error': error_msg,
'total_content_size': 0
}
self.processing_results.append(result)
return False
def _get_mime_type(self, extension: str) -> str:
"""Get MIME type based on file extension."""
mime_types = {
# Text and data files
'.txt': 'text/plain',
'.csv': 'text/csv',
'.json': 'application/json',
'.xml': 'application/xml',
'.html': 'text/html',
'.htm': 'text/html',
'.svg': 'image/svg+xml',
'.md': 'text/markdown',
'.markdown': 'text/markdown',
'.rst': 'text/x-rst',
'.log': 'text/plain',
'.ini': 'text/plain',
'.cfg': 'text/plain',
'.conf': 'text/plain',
# Programming languages
'.js': 'application/javascript',
'.ts': 'application/typescript',
'.jsx': 'text/jsx',
'.tsx': 'text/tsx',
'.py': 'text/x-python',
'.java': 'text/x-java-source',
'.c': 'text/x-c',
'.cpp': 'text/x-c++src',
'.cc': 'text/x-c++src',
'.cxx': 'text/x-c++src',
'.h': 'text/x-c',
'.hpp': 'text/x-c++hdr',
'.cs': 'text/x-csharp',
'.php': 'application/x-httpd-php',
'.rb': 'text/x-ruby',
'.go': 'text/x-go',
'.rs': 'text/x-rust',
'.swift': 'text/x-swift',
'.kt': 'text/x-kotlin',
'.scala': 'text/x-scala',
'.r': 'text/x-r',
'.m': 'text/x-matlab',
'.pl': 'text/x-perl',
'.sh': 'application/x-sh',
'.bash': 'application/x-sh',
'.zsh': 'application/x-sh',
'.fish': 'application/x-sh',
'.ps1': 'application/x-powershell',
'.bat': 'application/x-msdos-program',
'.cmd': 'application/x-msdos-program',
'.vbs': 'text/vbscript',
'.lua': 'text/x-lua',
'.sql': 'application/sql',
'.dart': 'application/dart',
'.elm': 'text/x-elm',
'.clj': 'text/x-clojure',
'.hs': 'text/x-haskell',
'.fs': 'text/x-fsharp',
'.ml': 'text/x-ocaml',
# Web technologies
'.css': 'text/css',
'.scss': 'text/x-scss',
'.sass': 'text/x-sass',
'.less': 'text/x-less',
'.vue': 'text/x-vue',
'.svelte': 'text/x-svelte',
'.astro': 'text/x-astro',
# Configuration and build files
'.yaml': 'application/x-yaml',
'.yml': 'application/x-yaml',
'.toml': 'application/toml',
'.env': 'text/plain',
'.gitignore': 'text/plain',
'.dockerfile': 'text/x-dockerfile',
'.dockerignore': 'text/plain',
'.makefile': 'text/x-makefile',
'.cmake': 'text/x-cmake',
'.gradle': 'text/x-gradle',
'.maven': 'text/x-maven',
'.pom': 'application/xml',
'.sln': 'text/plain',
'.vcxproj': 'application/xml',
'.csproj': 'application/xml',
'.fsproj': 'application/xml',
'.vbproj': 'application/xml',
'.xcodeproj': 'text/plain',
'.pbxproj': 'text/plain',
# Documentation and markup
'.tex': 'application/x-tex',
'.bib': 'text/x-bibtex',
'.adoc': 'text/asciidoc',
'.asciidoc': 'text/asciidoc',
'.wiki': 'text/x-wiki',
'.creole': 'text/x-wiki',
# Images
'.jpg': 'image/jpeg',
'.jpeg': 'image/jpeg',
'.png': 'image/png',
'.gif': 'image/gif',
'.webp': 'image/webp',
'.bmp': 'image/bmp',
'.tiff': 'image/tiff',
'.ico': 'image/x-icon',
# Documents
'.pdf': 'application/pdf',
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
'.odt': 'application/vnd.oasis.opendocument.text',
'.ods': 'application/vnd.oasis.opendocument.spreadsheet',
'.odp': 'application/vnd.oasis.opendocument.presentation',
# Legacy Office formats
'.doc': 'application/msword',
'.xls': 'application/vnd.ms-excel',
'.ppt': 'application/vnd.ms-powerpoint',
# Archives and binaries (will be processed as binary)
'.zip': 'application/zip',
'.tar': 'application/x-tar',
'.gz': 'application/gzip',
'.7z': 'application/x-7z-compressed',
'.rar': 'application/vnd.rar',
'.exe': 'application/x-msdownload',
'.dll': 'application/x-msdownload',
'.so': 'application/x-sharedlib',
'.dylib': 'application/x-mach-binary'
}
return mime_types.get(extension.lower(), 'application/octet-stream')
async def run_tests(self) -> None:
"""Run the document extraction tests on all files."""
mode = "WITH AI" if self.enable_ai else "CONTENT ONLY (No AI)"
logger.info(f"Starting document extraction tests - {mode}")
logger.info(f"Input directory: {self.input_dir}")
logger.info(f"Output directory: {self.output_dir}")
if self.enable_ai:
logger.info(f"Processing prompt: {self.prompt}")
else:
logger.info("AI processing: DISABLED - Raw content extraction only")
# Initialize the extractor
if not self.initialize_extractor():
logger.error("Cannot proceed without DocumentExtraction!")
return
# Get files to process
files = self.get_files_to_process()
if not files:
logger.warning("No files found to process!")
return
# Process each file
successful = 0
failed = 0
logger.info(f"Starting to process {len(files)} files...")
for i, file_path in enumerate(files):
logger.info(f"Processing file {i+1}/{len(files)}: {file_path.name}")
try:
if await self.process_single_file(file_path):
successful += 1
logger.info(f"File {i+1} processed successfully")
else:
failed += 1
logger.error(f"File {i+1} processing failed")
except Exception as e:
failed += 1
logger.error(f"Exception processing file {i+1}: {e}")
import traceback
traceback.print_exc()
# Print detailed summary
mode = "WITH AI" if self.enable_ai else "CONTENT ONLY (No AI)"
logger.info("\n" + "=" * 80)
logger.info(f"DETAILED TEST SUMMARY - {mode}")
logger.info("=" * 80)
logger.info(f"Total files processed: {len(files)}")
logger.info(f"Successful: {successful}")
logger.info(f"Failed: {failed}")
logger.info(f"Output directory: {self.output_dir}")
if self.enable_ai:
logger.info("AI processing: ENABLED")
else:
logger.info("AI processing: DISABLED")
logger.info("=" * 80)
# List all processed documents with results
logger.info("\nPROCESSING RESULTS:")
logger.info("-" * 80)
for result in self.processing_results:
status_icon = "" if result['status'] == 'OK' else ""
logger.info(f"{status_icon} {result['fileName']} - {result['status']}")
if result['status'] == 'OK':
if result['content_items'] == 1:
logger.info(f" └─ Generated: {result['output_files'][0]} ({result['total_content_size']} bytes)")
else:
logger.info(f" └─ Generated {result['content_items']} files ({result['total_content_size']} total bytes):")
for output_file in result['output_files']:
logger.info(f" └─ {output_file}")
else:
error_msg = result.get('error', 'Unknown error')
logger.info(f" └─ Error: {error_msg}")
logger.info("-" * 80)
logger.info("=" * 80)
def parse_arguments():
"""Parse command line arguments."""
parser = argparse.ArgumentParser(description='Document Extraction Test Script')
parser.add_argument('--no-ai', '--content-only', action='store_true',
help='Run in content-only mode without AI processing')
parser.add_argument('--input-dir', type=str, default='d:/temp/test-extraction',
help='Input directory containing files to process (default: d:/temp/test-extraction)')
parser.add_argument('--output-dir', type=str,
help='Output directory for extracted content (auto-generated if not specified)')
parser.add_argument('--verbose', '-v', action='store_true',
help='Enable verbose logging')
return parser.parse_args()
async def main():
"""Main function to run the tests."""
# Parse command line arguments
args = parse_arguments()
# Set logging level based on verbosity
if args.verbose:
logging.getLogger().setLevel(logging.DEBUG)
else:
logging.getLogger().setLevel(logging.INFO)
logger.info("DocumentExtraction Test Script")
logger.info("=" * 50)
logger.info(f"Source: {args.input_dir}")
# Determine output directory
if args.output_dir:
output_dir = args.output_dir
else:
if args.no_ai:
output_dir = f"{args.input_dir}/extracted-raw"
else:
output_dir = f"{args.input_dir}/extracted"
logger.info(f"Output: {output_dir}")
logger.info("=" * 50)
# Check dependencies first
if not check_dependencies():
logger.error("Please install missing dependencies before running tests.")
return
# Check module imports
if not check_module_imports():
logger.error("Cannot import required modules. Please check your setup.")
return
# Determine mode based on command line arguments
if args.no_ai:
enable_ai = False
logger.info("Running in CONTENT ONLY mode (no AI processing)")
else:
# Interactive mode: ask user for choice
print("\n" + "=" * 50)
print("SELECT EXTRACTION MODE:")
print("=" * 50)
print("1. With AI processing (default)")
print("2. Content only (no AI processing)")
print("=" * 50)
try:
choice = input("Enter your choice (1 or 2, default is 1): ").strip()
if choice == "2":
enable_ai = False
output_dir = f"{args.input_dir}/extracted-raw"
logger.info("Selected: Content only mode (no AI processing)")
else:
enable_ai = True
output_dir = f"{args.input_dir}/extracted"
logger.info("Selected: AI processing mode")
except (EOFError, KeyboardInterrupt):
# Default to AI mode if input fails
enable_ai = True
output_dir = f"{args.input_dir}/extracted"
logger.info("Defaulting to AI processing mode")
# Run tests with selected mode
tester = DocumentExtractionTester(
input_dir=args.input_dir,
output_dir=output_dir,
enable_ai=enable_ai
)
await tester.run_tests()
if __name__ == "__main__":
# Check if command line arguments are provided for automated testing
if len(sys.argv) > 1:
# Parse arguments and run directly
asyncio.run(main())
else:
# Interactive mode: ask user for choice
asyncio.run(main())
# Convenience function for easy content-only extraction
async def extract_documents_content_only(input_folder: str, output_folder: str = None):
"""
Convenience function to extract documents without AI processing.
Args:
input_folder: Path to folder containing documents to extract
output_folder: Path to folder where extracted content will be stored (optional)
Example:
# Extract from d:/temp to d:/temp/extracted-raw
asyncio.run(extract_documents_content_only("d:/temp"))
# Extract from custom folders
asyncio.run(extract_documents_content_only("c:/my_docs", "c:/my_docs/extracted"))
"""
if output_folder is None:
output_folder = f"{input_folder}/extracted-raw"
logger.info(f"Running content-only extraction from {input_folder} to {output_folder}")
# Check dependencies and imports
if not check_dependencies():
logger.error("Missing dependencies. Please install required packages.")
return False
if not check_module_imports():
logger.error("Cannot import required modules. Please check your setup.")
return False
# Create tester and run
tester = DocumentExtractionTester(
input_dir=input_folder,
output_dir=output_folder,
enable_ai=False
)
await tester.run_tests()
return True
# Example usage (uncomment to use):
# if __name__ == "__main__":
# # For content-only extraction from d:/temp to d:/temp/extracted-raw
# asyncio.run(extract_documents_content_only("d:/temp"))

View file

@ -1,189 +0,0 @@
#!/usr/bin/env python3
"""
Simple test script for enhanced Excel processing functionality.
This script tests the DocumentExtraction class with Excel files.
"""
import os
import sys
import asyncio
import logging
from pathlib import Path
# Configure logging
logging.basicConfig(
level=logging.DEBUG,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Add the gateway directory to the path
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', '..'))
async def test_excel_processing():
"""Test Excel processing functionality."""
try:
# Import required modules
from modules.chat.documents.documentExtraction import DocumentExtraction
from modules.chat.serviceCenter import ServiceCenter
from modules.interfaces.interfaceAppModel import User, UserPrivilege, AuthAuthority
from modules.interfaces.interfaceChatModel import ChatWorkflow
from datetime import datetime, UTC
logger.info("Testing Excel processing functionality...")
# Create mock service center
mock_user = User(
id="test_user_001",
username="testuser",
email="test@example.com",
fullName="Test User",
language="en",
enabled=True,
privilege=UserPrivilege.USER,
authenticationAuthority=AuthAuthority.LOCAL,
mandateId="test_mandate_001"
)
current_time = datetime.now(UTC).isoformat()
mock_workflow = ChatWorkflow(
id="test_workflow_001",
mandateId="test_mandate_001",
status="active",
name="Test Excel Processing Workflow",
currentRound=1,
lastActivity=current_time,
startedAt=current_time,
logs=[],
messages=[],
stats=None,
tasks=[]
)
service_center = ServiceCenter(mock_user, mock_workflow)
logger.info("ServiceCenter created successfully")
# Create DocumentExtraction instance
extractor = DocumentExtraction(service_center)
logger.info("DocumentExtraction created successfully")
# Test with a sample Excel file if available
test_file_path = "d:/temp/test-extraction/test.xlsx"
if os.path.exists(test_file_path):
logger.info(f"Found test file: {test_file_path}")
# Read the file
with open(test_file_path, 'rb') as f:
file_data = f.read()
logger.info(f"File size: {len(file_data)} bytes")
# Process the Excel file
logger.info("Processing Excel file...")
result = await extractor.processFileData(
fileData=file_data,
fileName="test.xlsx",
mimeType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
base64Encoded=False,
prompt=None,
enableAI=False
)
logger.info(f"Excel processing completed successfully!")
logger.info(f"Generated {len(result.contents)} content items:")
for i, content_item in enumerate(result.contents):
logger.info(f" Item {i+1}: {content_item.label}")
logger.info(f" MIME type: {content_item.metadata.mimeType}")
logger.info(f" Size: {content_item.metadata.size} bytes")
if content_item.data:
logger.info(f" Data preview: {content_item.data[:100]}...")
else:
logger.info(f" Data: None")
else:
logger.info("No test Excel file found. Creating a simple test...")
# Test the openpyxl library directly
try:
import openpyxl
from openpyxl import Workbook
# Create a test workbook
wb = Workbook()
ws = wb.active
ws.title = "Test Sheet"
# Add some test data
ws['A1'] = "Name"
ws['B1'] = "Age"
ws['C1'] = "City"
ws['A2'] = "John Doe"
ws['B2'] = 30
ws['C2'] = "New York"
ws['A3'] = "Jane Smith"
ws['B3'] = 25
ws['C3'] = "Los Angeles"
# Test properties
wb.properties.title = "Test Workbook"
wb.properties.creator = "Test User"
wb.properties.subject = "Test Subject"
logger.info("Test workbook created successfully")
logger.info(f" Title: {wb.properties.title}")
logger.info(f" Creator: {wb.properties.creator}")
logger.info(f" Subject: {wb.properties.subject}")
logger.info(f" Sheets: {wb.sheetnames}")
# Test the DocumentExtraction with this workbook
from io import BytesIO
# Save to bytes
buffer = BytesIO()
wb.save(buffer)
buffer.seek(0)
file_data = buffer.getvalue()
logger.info(f"Test workbook size: {len(file_data)} bytes")
# Process with DocumentExtraction
result = await extractor.processFileData(
fileData=file_data,
fileName="test_workbook.xlsx",
mimeType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
base64Encoded=False,
prompt=None,
enableAI=False
)
logger.info(f"Test workbook processing completed successfully!")
logger.info(f"Generated {len(result.contents)} content items:")
for i, content_item in enumerate(result.contents):
logger.info(f" Item {i+1}: {content_item.label}")
logger.info(f" MIME type: {content_item.metadata.mimeType}")
logger.info(f" Size: {content_item.metadata.size} bytes")
if content_item.data:
logger.info(f" Data preview: {content_item.data[:200]}...")
else:
logger.info(f" Data: None")
except ImportError as e:
logger.error(f"openpyxl not available: {e}")
except Exception as e:
logger.error(f"Error testing Excel functionality: {e}")
logger.info("Excel processing test completed!")
except ImportError as e:
logger.error(f"Failed to import required modules: {e}")
logger.error("Make sure you're running this script from the gateway directory")
except Exception as e:
logger.error(f"Unexpected error: {e}")
import traceback
traceback.print_exc()
if __name__ == "__main__":
asyncio.run(test_excel_processing())

View file

@ -1,658 +0,0 @@
#!/usr/bin/env python3
"""
Test script for MethodWeb class.
Tests all web actions: search, crawl, and scrape with various parameter sets.
Features:
- Tests web search functionality with different queries
- Tests web crawling with URL lists
- Tests web scraping (search + crawl combined)
- Detailed logging and progress tracking
- Error handling and validation testing
- Configuration validation
Usage:
- Interactive mode: python test_methodWeb.py
- Automated mode: python test_methodWeb.py --auto
- Verbose mode: python test_methodWeb.py --verbose
"""
import os
import asyncio
import logging
import sys
import argparse
import json
from pathlib import Path
from typing import List, Optional, Dict, Any
from datetime import datetime, UTC
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Filter out specific unwanted log messages
class LogFilter(logging.Filter):
"""Filter to hide specific unwanted log messages."""
def filter(self, record):
# Hide HTTP request info messages
if "HTTP Request:" in record.getMessage() and "POST https://api.tavily.com" in record.getMessage():
return False
# Hide HTTP response info messages
if "HTTP/1.1 200 OK" in record.getMessage():
return False
return True
# Apply the filter to the root logger
root_logger = logging.getLogger()
root_logger.addFilter(LogFilter())
def check_dependencies():
"""Check if required dependencies are available."""
missing_deps = []
# Check for required dependencies
try:
import tavily
logger.info("tavily-python is available")
except ImportError:
missing_deps.append("tavily-python")
logger.error("tavily-python is missing")
try:
import httpx
logger.info("httpx is available")
except ImportError:
missing_deps.append("httpx")
logger.error("httpx is missing")
if missing_deps:
logger.error("\n" + "="*60)
logger.error("MISSING DEPENDENCIES DETECTED!")
logger.error("="*60)
logger.error("The following packages are required but not installed:")
for dep in missing_deps:
logger.error(f" - {dep}")
logger.error("\nTo install all dependencies, run:")
logger.error("pip install -r requirements.txt")
logger.error("="*60)
return False
logger.info("All required dependencies are available!")
return True
def check_module_imports():
"""Check if we can import the required modules."""
try:
# Add the gateway directory to the path so we can import our modules
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', '..'))
from modules.methods.methodWeb import MethodWeb
from modules.chat.serviceCenter import ServiceCenter
from modules.interfaces.interfaceAppModel import User, UserConnection, UserPrivilege, AuthAuthority
from modules.interfaces.interfaceChatModel import ChatWorkflow, TaskItem, TaskStatus
from modules.shared.configuration import APP_CONFIG
logger.info("All required modules imported successfully")
return True
except ImportError as e:
logger.error(f"Failed to import required modules: {e}")
logger.error("Make sure you're running this script from the gateway directory")
return False
except Exception as e:
logger.error(f"Unexpected error importing modules: {e}")
return False
def check_configuration():
"""Check if required configuration is available."""
try:
from modules.shared.configuration import APP_CONFIG
# Check Tavily API key
tavily_api_key = APP_CONFIG.get("Connector_WebTavily_API_KEY")
if not tavily_api_key or tavily_api_key == "your_tavily_api_key_here":
logger.error("Tavily API key not configured!")
logger.error("Please set Connector_WebTavily_API_KEY in config.ini")
return False
logger.info("Tavily API key is configured")
# Check other web configuration
web_configs = [
"Web_Search_MAX_QUERY_LENGTH",
"Web_Search_MAX_RESULTS",
"Web_Search_MIN_RESULTS",
"Web_Crawl_TIMEOUT",
"Web_Crawl_MAX_RETRIES",
"Web_Crawl_RETRY_DELAY"
]
for config_key in web_configs:
value = APP_CONFIG.get(config_key)
if value:
logger.info(f"Configuration {config_key}: {value}")
else:
logger.warning(f"Configuration {config_key} not set, using default")
return True
except Exception as e:
logger.error(f"Failed to check configuration: {e}")
return False
def create_mock_service_center():
"""Create a proper ServiceCenter for testing purposes."""
try:
from modules.chat.serviceCenter import ServiceCenter
from modules.interfaces.interfaceAppModel import User, UserPrivilege, AuthAuthority
from modules.interfaces.interfaceChatModel import ChatWorkflow, TaskItem, TaskStatus
from modules.interfaces.interfaceChatModel import ChatLog, ChatMessage, ChatStat
# Create proper user with all required fields
mock_user = User(
id="test_user_web_001",
username="testuser_web",
email="testweb@example.com",
fullName="Test Web User",
language="en",
enabled=True,
privilege=UserPrivilege.USER,
authenticationAuthority=AuthAuthority.LOCAL,
mandateId="test_mandate_web_001"
)
# Create proper workflow with all required fields
current_time = datetime.now(UTC).timestamp()
mock_workflow = ChatWorkflow(
id="test_workflow_web_001",
mandateId="test_mandate_web_001",
status="active",
name="Test Web Method Workflow",
currentRound=1,
lastActivity=current_time,
startedAt=current_time,
logs=[],
messages=[],
stats=None,
tasks=[]
)
# Create service center
service_center = ServiceCenter(mock_user, mock_workflow)
logger.info("ServiceCenter created successfully for web testing")
return service_center
except Exception as e:
logger.error(f"Failed to create ServiceCenter: {e}")
return None
class MethodWebTester:
"""Test class for MethodWeb functionality."""
def __init__(self):
"""Initialize the tester."""
self.method_web = None
self.service_center = None
# Test results tracking
self.test_results = []
# Test parameter sets
self.test_queries = [
"Python programming tutorial",
"FastAPI documentation",
"machine learning basics",
"web scraping best practices"
]
self.test_urls = [
"https://docs.python.org/3/tutorial/",
"https://fastapi.tiangolo.com/",
"https://scikit-learn.org/stable/",
"https://requests.readthedocs.io/en/latest/"
]
def initialize_method_web(self):
"""Initialize the MethodWeb instance with a proper ServiceCenter."""
try:
# First create the service center
self.service_center = create_mock_service_center()
if not self.service_center:
logger.error("Failed to create ServiceCenter!")
return False
# Now create MethodWeb with the service center
from modules.methods.methodWeb import MethodWeb
self.method_web = MethodWeb(self.service_center)
logger.info("MethodWeb initialized successfully with ServiceCenter")
return True
except Exception as e:
logger.error(f"Failed to initialize MethodWeb: {e}")
return False
async def test_search_action(self, query: str, max_results: int = 5) -> Dict[str, Any]:
"""Test the search action with given parameters."""
logger.info(f"Testing search action with query: '{query}', max_results: {max_results}")
try:
parameters = {
"query": query,
"maxResults": max_results
}
result = await self.method_web.search(parameters)
test_result = {
"action": "search",
"query": query,
"max_results": max_results,
"success": result.success,
"error": result.error if not result.success else None,
"documents_count": len(result.documents) if result.documents else 0,
"result_label": result.resultLabel if hasattr(result, 'resultLabel') else None
}
if result.success:
logger.info(f"✅ Search successful: {test_result['documents_count']} documents returned")
if result.documents:
for i, doc in enumerate(result.documents):
logger.info(f" Document {i+1}: {doc.documentName}")
if hasattr(doc, 'documentData') and hasattr(doc.documentData, 'results'):
logger.info(f" Results count: {len(doc.documentData.results)}")
else:
logger.error(f"❌ Search failed: {result.error}")
return test_result
except Exception as e:
error_msg = str(e)
logger.error(f"❌ Search action exception: {error_msg}")
return {
"action": "search",
"query": query,
"max_results": max_results,
"success": False,
"error": f"Exception: {error_msg}",
"documents_count": 0,
"result_label": None
}
async def test_crawl_action(self, urls: List[str]) -> Dict[str, Any]:
"""Test the crawl action with given URLs."""
logger.info(f"Testing crawl action with {len(urls)} URLs")
try:
# Monkeypatch the service to return a mock document and file data
class _MockDoc:
def __init__(self, fileId: str, fileName: str = "mock_search_results.json"):
self.fileId = fileId
self.fileName = fileName
def _mock_get_docs(_doc_ids):
return [_MockDoc(fileId="mock_file_id", fileName="mock_search_results.json")]
# Build minimal JSON structure expected by methodWeb.crawl
mock_payload = {
"documentData": {
"results": [{"url": u} for u in urls]
}
}
def _mock_get_file_data(_file_id):
return json.dumps(mock_payload).encode("utf-8")
# Apply monkeypatches to the method's service
self.method_web.service.getChatDocumentsFromDocumentList = _mock_get_docs
self.method_web.service.getFileData = _mock_get_file_data
# Use any string as the document list reference; service is mocked
parameters = {"documentList": "mock_document_list_ref"}
result = await self.method_web.crawl(parameters)
test_result = {
"action": "crawl",
"urls_count": len(urls),
"success": result.success,
"error": result.error if not result.success else None,
"documents_count": len(result.documents) if result.documents else 0,
"result_label": result.resultLabel if hasattr(result, 'resultLabel') else None
}
if result.success:
logger.info(f"✅ Crawl successful: {test_result['documents_count']} documents returned")
if result.documents:
for i, doc in enumerate(result.documents):
logger.info(f" Document {i+1}: {doc.documentName}")
else:
logger.error(f"❌ Crawl failed: {result.error}")
return test_result
except Exception as e:
error_msg = str(e)
logger.error(f"❌ Crawl action exception: {error_msg}")
return {
"action": "crawl",
"urls_count": len(urls),
"success": False,
"error": f"Exception: {error_msg}",
"documents_count": 0,
"result_label": None
}
async def test_scrape_action(self, query: str, max_results: int = 3) -> Dict[str, Any]:
"""Test the scrape action (search + crawl combined) with given parameters."""
logger.info(f"Testing scrape action with query: '{query}', max_results: {max_results}")
try:
parameters = {
"query": query,
"maxResults": max_results
}
result = await self.method_web.scrape(parameters)
test_result = {
"action": "scrape",
"query": query,
"max_results": max_results,
"success": result.success,
"error": result.error if not result.success else None,
"documents_count": len(result.documents) if result.documents else 0,
"result_label": result.resultLabel if hasattr(result, 'resultLabel') else None
}
if result.success:
logger.info(f"✅ Scrape successful: {test_result['documents_count']} documents returned")
if result.documents:
for i, doc in enumerate(result.documents):
logger.info(f" Document {i+1}: {doc.documentName}")
if hasattr(doc, 'documentData') and hasattr(doc.documentData, 'results'):
logger.info(f" Results count: {len(doc.documentData.results)}")
else:
logger.error(f"❌ Scrape failed: {result.error}")
return test_result
except Exception as e:
error_msg = str(e)
logger.error(f"❌ Scrape action exception: {error_msg}")
return {
"action": "scrape",
"query": query,
"max_results": max_results,
"success": False,
"error": f"Exception: {error_msg}",
"documents_count": 0,
"result_label": None
}
async def test_parameter_validation(self) -> List[Dict[str, Any]]:
"""Test parameter validation with invalid inputs."""
logger.info("Testing parameter validation with invalid inputs")
validation_tests = []
# Test 1: Empty query
logger.info("Test 1: Empty query")
result = await self.test_search_action("", 5)
# For validation tests, we expect the request to fail with validation error
if not result["success"] and "validation error" in result.get("error", "").lower():
result["success"] = True # Mark as successful validation test
result["validation_test"] = True
result["expected_behavior"] = "Correctly rejected empty query"
logger.info("✅ Validation test PASSED: Empty query correctly rejected")
validation_tests.append(result)
# Test 2: Query too long (over 400 characters)
long_query = "a" * 500
logger.info("Test 2: Query too long")
result = await self.test_search_action(long_query, 5)
if not result["success"] and "validation error" in result.get("error", "").lower():
result["success"] = True # Mark as successful validation test
result["validation_test"] = True
result["expected_behavior"] = "Correctly rejected overly long query"
logger.info("✅ Validation test PASSED: Long query correctly rejected")
validation_tests.append(result)
# Test 3: Max results too high
logger.info("Test 3: Max results too high")
result = await self.test_search_action("test", 25)
if not result["success"] and "validation error" in result.get("error", "").lower():
result["success"] = True # Mark as successful validation test
result["validation_test"] = True
result["expected_behavior"] = "Correctly rejected excessive max results"
logger.info("✅ Validation test PASSED: High max results correctly rejected")
validation_tests.append(result)
# Test 4: Max results too low
logger.info("Test 4: Max results too low")
result = await self.test_search_action("test", 0)
if not result["success"] and "validation error" in result.get("error", "").lower():
result["success"] = True # Mark as successful validation test
result["validation_test"] = True
result["expected_behavior"] = "Correctly rejected zero max results"
logger.info("✅ Validation test PASSED: Zero max results correctly rejected")
validation_tests.append(result)
return validation_tests
async def run_all_tests(self) -> None:
"""Run all web method tests."""
logger.info("Starting MethodWeb comprehensive tests")
logger.info("=" * 60)
# Initialize the method
if not self.initialize_method_web():
logger.error("Cannot proceed without MethodWeb!")
return
# Test 1: Search actions with different queries
logger.info("\n" + "=" * 60)
logger.info("TEST 1: SEARCH ACTIONS")
logger.info("=" * 60)
for i, query in enumerate(self.test_queries):
logger.info(f"\nSearch test {i+1}/{len(self.test_queries)}")
result = await self.test_search_action(query, 3)
self.test_results.append(result)
await asyncio.sleep(1) # Rate limiting
# Test 2: Scrape actions (search + crawl combined)
logger.info("\n" + "=" * 60)
logger.info("TEST 2: SCRAPE ACTIONS")
logger.info("=" * 60)
scrape_queries = self.test_queries[:2] # Use first 2 queries for scraping
for i, query in enumerate(scrape_queries):
logger.info(f"\nScrape test {i+1}/{len(scrape_queries)}")
result = await self.test_scrape_action(query, 2)
self.test_results.append(result)
await asyncio.sleep(2) # Rate limiting for scraping
# Test 3: Parameter validation
logger.info("\n" + "=" * 60)
logger.info("TEST 3: PARAMETER VALIDATION")
logger.info("=" * 60)
validation_results = await self.test_parameter_validation()
self.test_results.extend(validation_results)
# Test 4: Crawl action (if we have search results)
logger.info("\n" + "=" * 60)
logger.info("TEST 4: CRAWL ACTIONS")
logger.info("=" * 60)
logger.info("Testing crawl with sample URLs")
result = await self.test_crawl_action(self.test_urls[:2])
self.test_results.append(result)
# Print comprehensive summary
self.print_test_summary()
def print_test_summary(self):
"""Print comprehensive test summary."""
logger.info("\n" + "=" * 80)
logger.info("COMPREHENSIVE TEST SUMMARY")
logger.info("=" * 80)
total_tests = len(self.test_results)
successful_tests = sum(1 for result in self.test_results if result["success"])
failed_tests = total_tests - successful_tests
logger.info(f"Total tests run: {total_tests}")
logger.info(f"Successful: {successful_tests}")
logger.info(f"Failed: {failed_tests}")
logger.info(f"Success rate: {(successful_tests/total_tests)*100:.1f}%")
# Group results by action type
action_groups = {}
for result in self.test_results:
action = result["action"]
if action not in action_groups:
action_groups[action] = []
action_groups[action].append(result)
logger.info("\n" + "-" * 80)
logger.info("RESULTS BY ACTION TYPE:")
logger.info("-" * 80)
for action, results in action_groups.items():
action_successful = sum(1 for r in results if r["success"])
action_total = len(results)
logger.info(f"\n{action.upper()} ACTIONS:")
logger.info(f" Total: {action_total}, Successful: {action_successful}, Failed: {action_total - action_successful}")
for i, result in enumerate(results):
status_icon = "" if result["success"] else ""
# Handle validation tests specially
if result.get("validation_test", False):
logger.info(f" {status_icon} Validation Test {i+1}: {result.get('expected_behavior', 'Validation working correctly')}")
if result.get("error"):
logger.info(f" Validation Error: {result['error']}")
elif action == "search":
logger.info(f" {status_icon} Test {i+1}: '{result['query']}' -> {result['documents_count']} docs")
elif action == "scrape":
logger.info(f" {status_icon} Test {i+1}: '{result['query']}' -> {result['documents_count']} docs")
elif action == "crawl":
logger.info(f" {status_icon} Test {i+1}: {result['urls_count']} URLs -> {result['documents_count']} docs")
if not result["success"] and not result.get("validation_test", False):
logger.info(f" Error: {result['error']}")
logger.info("\n" + "-" * 80)
logger.info("CONFIGURATION STATUS:")
logger.info("-" * 80)
try:
from modules.shared.configuration import APP_CONFIG
tavily_key = APP_CONFIG.get("Connector_WebTavily_API_KEY")
if tavily_key and tavily_key != "your_tavily_api_key_here":
logger.info("✅ Tavily API key: Configured")
else:
logger.info("❌ Tavily API key: Not configured")
web_configs = [
("Web_Search_MAX_QUERY_LENGTH", "400"),
("Web_Search_MAX_RESULTS", "20"),
("Web_Search_MIN_RESULTS", "1"),
("Web_Crawl_TIMEOUT", "30"),
("Web_Crawl_MAX_RETRIES", "3"),
("Web_Crawl_RETRY_DELAY", "2")
]
for config_key, default_value in web_configs:
value = APP_CONFIG.get(config_key, default_value)
logger.info(f"{config_key}: {value}")
except Exception as e:
logger.error(f"❌ Configuration check failed: {e}")
logger.info("=" * 80)
def parse_arguments():
"""Parse command line arguments."""
parser = argparse.ArgumentParser(description='MethodWeb Test Script')
parser.add_argument('--auto', action='store_true',
help='Run tests automatically without user interaction')
parser.add_argument('--verbose', '-v', action='store_true',
help='Enable verbose logging')
parser.add_argument('--quick', action='store_true',
help='Run quick tests with fewer queries')
return parser.parse_args()
async def main():
"""Main function to run the tests."""
# Parse command line arguments
args = parse_arguments()
# Set logging level based on verbosity
if args.verbose:
logging.getLogger().setLevel(logging.DEBUG)
else:
logging.getLogger().setLevel(logging.INFO)
logger.info("MethodWeb Test Script")
logger.info("=" * 50)
# Check dependencies first
if not check_dependencies():
logger.error("Please install missing dependencies before running tests.")
return
# Check module imports
if not check_module_imports():
logger.error("Cannot import required modules. Please check your setup.")
return
# Check configuration
if not check_configuration():
logger.error("Configuration check failed. Please check your config.ini file.")
return
# Determine test mode
if args.auto:
logger.info("Running in automated mode")
else:
# Interactive mode: ask user for confirmation
print("\n" + "=" * 50)
print("METHODWEB TEST SCRIPT")
print("=" * 50)
print("This script will test the MethodWeb functionality including:")
print("- Web search actions")
print("- Web scraping actions")
print("- Web crawling actions")
print("- Parameter validation")
print("=" * 50)
try:
choice = input("Do you want to proceed? (y/N): ").strip().lower()
if choice not in ['y', 'yes']:
logger.info("Test cancelled by user")
return
except (EOFError, KeyboardInterrupt):
logger.info("Test cancelled by user")
return
# Create tester and run tests
tester = MethodWebTester()
# Modify test queries for quick mode
if args.quick:
tester.test_queries = tester.test_queries[:2] # Use only first 2 queries
logger.info("Running in quick mode with reduced test set")
await tester.run_all_tests()
if __name__ == "__main__":
# Run the tests
asyncio.run(main())

View file

@ -1,51 +0,0 @@
#!/usr/bin/env python3
"""
Test script for Outlook filter logic
"""
def test_build_graph_filter():
"""Test the filter building logic"""
# Mock the _buildGraphFilter method
def _buildGraphFilter(filter_text):
if not filter_text:
return {}
filter_text = filter_text.strip()
# Handle email address filters
if '@' in filter_text and '.' in filter_text and ' ' not in filter_text:
return {"$filter": f"from/fromAddress/address eq '{filter_text}'"}
# Handle search queries (from:, to:, subject:, etc.)
if any(filter_text.startswith(prefix) for prefix in ['from:', 'to:', 'subject:', 'received:', 'hasattachment:']):
return {"$search": f'"{filter_text}"'}
# Handle text content - search in subject
return {"$filter": f"contains(subject,'{filter_text}')"}
# Test cases
test_cases = [
("peter.muster@domain.com", {"$filter": "from/fromAddress/address eq 'peter.muster@domain.com'"}),
("from:user@example.com", {"$search": '"from:user@example.com"'}),
("subject:meeting", {"$search": '"subject:meeting"'}),
("project update", {"$filter": "contains(subject,'project update')"}),
("", {}),
(" hello world ", {"$filter": "contains(subject,'hello world')"}),
]
print("Testing Outlook filter logic:")
print("=" * 50)
for test_input, expected_output in test_cases:
result = _buildGraphFilter(test_input)
status = "✓ PASS" if result == expected_output else "✗ FAIL"
print(f"{status} | Input: '{test_input}'")
print(f" | Expected: {expected_output}")
print(f" | Got: {result}")
print()
print("Test completed!")
if __name__ == "__main__":
test_build_graph_filter()

View file

@ -1,70 +0,0 @@
#!/usr/bin/env python3
"""
Test script for fixed Outlook filter logic
"""
def test_build_graph_filter():
"""Test the corrected filter building logic"""
# Mock the corrected _buildGraphFilter method
def _buildGraphFilter(filter_text):
if not filter_text:
return {}
filter_text = filter_text.strip()
# Handle search queries (from:, to:, subject:, etc.) - check this FIRST
if any(filter_text.startswith(prefix) for prefix in ['from:', 'to:', 'subject:', 'received:', 'hasattachment:']):
return {"$search": f'"{filter_text}"'}
# Handle email address filters (only if it's NOT a search query)
if '@' in filter_text and '.' in filter_text and ' ' not in filter_text and not filter_text.startswith('from:'):
return {"$filter": f"from/fromAddress/address eq '{filter_text}'"}
# Handle text content - search in subject
return {"$filter": f"contains(subject,'{filter_text}')"}
# Test cases
test_cases = [
("peter.muster@domain.com", {"$filter": "from/fromAddress/address eq 'peter.muster@domain.com'"}),
("from:user@example.com", {"$search": '"from:user@example.com"'}),
("subject:meeting", {"$search": '"subject:meeting"'}),
("project update", {"$filter": "contains(subject,'project update')"}),
("", {}),
(" hello world ", {"$filter": "contains(subject,'hello world')"}),
# Additional edge cases
("to:manager@company.com", {"$search": '"to:manager@company.com"'}),
("received:today", {"$search": '"received:today"'}),
("hasattachment:true", {"$search": '"hasattachment:true"'}),
("user@domain.com", {"$filter": "from/fromAddress/address eq 'user@domain.com'"}),
("from:user@domain.com subject:budget", {"$search": '"from:user@domain.com subject:budget"'}),
]
print("Testing FIXED Outlook filter logic:")
print("=" * 50)
passed = 0
failed = 0
for test_input, expected_output in test_cases:
result = _buildGraphFilter(test_input)
status = "✓ PASS" if result == expected_output else "✗ FAIL"
if result == expected_output:
passed += 1
else:
failed += 1
print(f"{status} | Input: '{test_input}'")
print(f" | Expected: {expected_output}")
print(f" | Got: {result}")
print()
print(f"Test completed! {passed} passed, {failed} failed")
if failed == 0:
print("🎉 All tests passed!")
else:
print("❌ Some tests failed. Please check the logic.")
if __name__ == "__main__":
test_build_graph_filter()

View file

@ -1,100 +0,0 @@
#!/usr/bin/env python3
"""
Test script for Pydantic compatibility module.
This script tests the version-aware functionality for both Pydantic v1 and v2.
"""
import sys
import os
# Add the modules directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'modules'))
def test_compatibility_module():
"""Test the Pydantic compatibility module"""
try:
from shared.pydanticCompat import (
PYDANTIC_VERSION,
create_private_field,
create_model_config,
model_to_dict,
model_from_dict,
get_version_info
)
print(f"✅ Successfully imported Pydantic compatibility module")
print(f"📊 Pydantic version detected: {PYDANTIC_VERSION}")
# Test version info
version_info = get_version_info()
print(f"🔍 Version info: {version_info}")
# Test field creation
private_field = create_private_field(default="test")
print(f"✅ Private field created: {type(private_field)}")
# Test model config
config = create_model_config(validate_assignment=True)
print(f"✅ Model config created: {type(config)}")
return True
except Exception as e:
print(f"❌ Error testing compatibility module: {e}")
return False
def test_chat_document_model():
"""Test the ChatDocument model with compatibility"""
try:
from interfaces.interfaceChatModel import ChatDocument
print(f"✅ Successfully imported ChatDocument model")
# Test creating a document
doc = ChatDocument(fileId="test-file-123")
print(f"✅ ChatDocument created: {doc.id}")
# Test setting component interface
doc.setComponentInterface("mock_interface")
print(f"✅ Component interface set")
# Test serialization
doc_dict = doc.to_dict()
print(f"✅ Document serialized: {doc_dict}")
# Test validation
is_valid = doc.validate_component_interface()
print(f"✅ Component interface validation: {is_valid}")
return True
except Exception as e:
print(f"❌ Error testing ChatDocument model: {e}")
import traceback
traceback.print_exc()
return False
def main():
"""Main test function"""
print("🧪 Testing Pydantic Compatibility Module")
print("=" * 50)
# Test compatibility module
compat_ok = test_compatibility_module()
print()
# Test ChatDocument model
model_ok = test_chat_document_model()
print()
# Summary
print("=" * 50)
if compat_ok and model_ok:
print("🎉 All tests passed! Pydantic compatibility is working correctly.")
return 0
else:
print("💥 Some tests failed. Check the errors above.")
return 1
if __name__ == "__main__":
sys.exit(main())

View file

@ -1,207 +0,0 @@
#!/usr/bin/env python3
"""
Test script for web CSV functionality
Tests both CSV output generation and CSV input reading
"""
import sys
import os
import asyncio
from typing import Dict, Any
# Add the gateway directory to the Python path
sys.path.insert(0, os.path.join(os.path.dirname(__file__)))
from modules.methods.methodWeb import MethodWeb
from modules.interfaces.interfaceWebModel import WebSearchResultItem, WebSearchDocumentData, WebSearchActionDocument, WebSearchActionResult
from pydantic import HttpUrl
def create_mock_web_search_result():
"""Create a mock WebSearchActionResult with the provided example data"""
# Create mock search result items based on the provided example
results = [
WebSearchResultItem(
title="Switzerland Market Analysis :: Fitch Solutions",
url=HttpUrl("https://www.fitchsolutions.com/bmi/region/switzerland")
),
WebSearchResultItem(
title="OECD Economic Outlook, Volume 2024 Issue 2: Switzerland",
url=HttpUrl("https://www.oecd.org/en/publications/2024/12/oecd-economic-outlook-volume-2024-issue-2_67bb8fac/full-report/switzerland_605fd31f.html")
),
WebSearchResultItem(
title="The economic context of Switzerland - International Trade Portal",
url=HttpUrl("https://www.lloydsbanktrade.com/en/market-potential/switzerland/economical-context")
),
WebSearchResultItem(
title="Switzerland: Country File, Economic Risk Analysis | Coface",
url=HttpUrl("https://www.coface.com/news-economy-and-insights/business-risk-dashboard/country-risk-files/switzerland")
),
WebSearchResultItem(
title="Swiss Economic Outlook 2025 - Roland Berger",
url=HttpUrl("https://www.rolandberger.com/en/Insights/Publications/Swiss-Economic-Outlook-2025.html")
)
]
# Create document data
document_data = WebSearchDocumentData(
query="current market trends Switzerland business economy 2024 analysis report",
results=results,
total_count=len(results)
)
# Create action document
action_document = WebSearchActionDocument(
documentName="test_search_results.json",
documentData=document_data,
mimeType="application/json"
)
# Create action result
action_result = WebSearchActionResult(
success=True,
documents=[action_document]
)
return action_result
def test_csv_output_generation():
"""Test CSV output generation from web search results"""
print("Testing CSV output generation...")
# Create method instance (without service center for testing)
method = MethodWeb(None)
# Create mock search result
mock_result = create_mock_web_search_result()
# Convert to CSV
csv_content = method._convert_search_results_to_csv(mock_result)
print("Generated CSV content:")
print(csv_content)
print()
# Verify CSV format
lines = csv_content.strip().split('\n')
assert len(lines) == 6, f"Expected 6 lines (header + 5 results), got {len(lines)}"
# Check header
assert lines[0] == "url;title", f"Expected header 'url;title', got '{lines[0]}'"
# Check that URLs are present
for i, line in enumerate(lines[1:], 1):
parts = line.split(';')
assert len(parts) == 2, f"Line {i} should have 2 parts separated by ';', got {len(parts)}"
url, title = parts
assert url.startswith('https://'), f"Line {i} URL should start with 'https://', got '{url}'"
assert title, f"Line {i} should have a title, got empty title"
print("✓ CSV output generation test passed!")
return csv_content
def test_csv_input_reading():
"""Test CSV input reading functionality"""
print("Testing CSV input reading...")
# Create method instance
method = MethodWeb(None)
# Test semicolon-separated CSV
semicolon_csv = """url;title
https://www.fitchsolutions.com/bmi/region/switzerland;Switzerland Market Analysis :: Fitch Solutions
https://www.oecd.org/en/publications/2024/12/oecd-economic-outlook-volume-2024-issue-2_67bb8fac/full-report/switzerland_605fd31f.html;OECD Economic Outlook, Volume 2024 Issue 2: Switzerland
https://www.lloydsbanktrade.com/en/market-potential/switzerland/economical-context;The economic context of Switzerland - International Trade Portal"""
urls_semicolon = method._read_csv_with_urls(semicolon_csv)
print(f"Extracted {len(urls_semicolon)} URLs from semicolon CSV:")
for url in urls_semicolon:
print(f" - {url}")
assert len(urls_semicolon) == 3, f"Expected 3 URLs, got {len(urls_semicolon)}"
assert all(url.startswith('https://') for url in urls_semicolon), "All URLs should start with https://"
print("✓ Semicolon CSV reading test passed!")
# Test comma-separated CSV
comma_csv = """url,title
https://www.fitchsolutions.com/bmi/region/switzerland,Switzerland Market Analysis :: Fitch Solutions
https://www.oecd.org/en/publications/2024/12/oecd-economic-outlook-volume-2024-issue-2_67bb8fac/full-report/switzerland_605fd31f.html,OECD Economic Outlook, Volume 2024 Issue 2: Switzerland"""
urls_comma = method._read_csv_with_urls(comma_csv)
print(f"Extracted {len(urls_comma)} URLs from comma CSV:")
for url in urls_comma:
print(f" - {url}")
assert len(urls_comma) == 2, f"Expected 2 URLs, got {len(urls_comma)}"
assert all(url.startswith('https://') for url in urls_comma), "All URLs should start with https://"
print("✓ Comma CSV reading test passed!")
# Test case-insensitive column names
case_insensitive_csv = """URL;Title
https://example.com/test;Test Title"""
urls_case = method._read_csv_with_urls(case_insensitive_csv)
assert len(urls_case) == 1, f"Expected 1 URL, got {len(urls_case)}"
assert urls_case[0] == "https://example.com/test", f"Expected 'https://example.com/test', got '{urls_case[0]}'"
print("✓ Case-insensitive CSV reading test passed!")
def test_integration():
"""Test the complete integration: generate CSV and then read it back"""
print("Testing integration: generate CSV and read it back...")
method = MethodWeb(None)
# Generate CSV from mock data
mock_result = create_mock_web_search_result()
csv_content = method._convert_search_results_to_csv(mock_result)
# Read URLs back from the generated CSV
extracted_urls = method._read_csv_with_urls(csv_content)
print(f"Generated CSV with {len(mock_result.documents[0].documentData.results)} results")
print(f"Extracted {len(extracted_urls)} URLs from generated CSV")
# Verify we got the same number of URLs
assert len(extracted_urls) == len(mock_result.documents[0].documentData.results), \
f"Expected {len(mock_result.documents[0].documentData.results)} URLs, got {len(extracted_urls)}"
# Verify URLs match
original_urls = [str(result.url) for result in mock_result.documents[0].documentData.results]
for i, (original, extracted) in enumerate(zip(original_urls, extracted_urls)):
assert original == extracted, f"URL {i} mismatch: expected '{original}', got '{extracted}'"
print("✓ Integration test passed!")
if __name__ == "__main__":
print("Running Web CSV Functionality Tests")
print("=" * 50)
try:
# Test CSV output generation
csv_content = test_csv_output_generation()
print()
# Test CSV input reading
test_csv_input_reading()
print()
# Test integration
test_integration()
print()
print("=" * 50)
print("🎉 All tests passed successfully!")
except Exception as e:
print(f"❌ Test failed: {e}")
import traceback
traceback.print_exc()
sys.exit(1)