diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py
index 65bae155..cb54a42b 100644
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@@ -14,10 +14,6 @@ from modules.datamodels.datamodelWorkflow import AiResponse, AiResponseMetadata,
 from modules.datamodels.datamodelDocument import RenderedDocument
 from modules.interfaces.interfaceAiObjects import AiObjects
 from modules.shared.jsonUtils import (
-    extractJsonString,
-    repairBrokenJson,
-    extractSectionsFromDocument,
-    buildContinuationContext,
     parseJsonWithModel
 )
 from modules.services.serviceAi.subJsonResponseHandling import JsonResponseHandler
@@ -209,7 +205,7 @@ Respond with ONLY a JSON object in this exact format:
             processingMode=ProcessingModeEnum.BASIC
         )

-    async def _callAiWithLooping(
+    async def callAiWithLooping(
         self,
         prompt: str,
         options: AiCallOptions,
@@ -218,11 +214,12 @@ Respond with ONLY a JSON object in this exact format:
         promptArgs: Optional[Dict[str, Any]] = None,
         operationId: Optional[str] = None,
         userPrompt: Optional[str] = None,
-        contentParts: Optional[List[ContentPart]] = None  # ARCHITECTURE: Support ContentParts for large content
+        contentParts: Optional[List[ContentPart]] = None,  # ARCHITECTURE: Support ContentParts for large content
+        useCaseId: Optional[str] = None  # REQUIRED: Explicit use case ID for the generic looping system
     ) -> str:
-        """Delegate to AiCallLooper."""
+        """Public method: Delegate to AiCallLooper for AI calls with looping support."""
         return await self.aiCallLooper.callAiWithLooping(
-            prompt, options, debugPrefix, promptBuilder, promptArgs, operationId, userPrompt, contentParts
+            prompt, options, debugPrefix, promptBuilder, promptArgs, operationId, userPrompt, contentParts, useCaseId
         )

     async def _defineKpisFromPrompt(
@@ -341,49 +338,21 @@ Respond with ONLY a JSON object in this exact format:
         prompt: str,
         options: AiCallOptions,
         title: Optional[str],
-        aiOperationId: str
+        parentOperationId: Optional[str]
     ) -> AiResponse:
-        """Handle IMAGE_GENERATE operation type."""
-        self.services.chat.progressLogUpdate(aiOperationId, 0.4, "Calling AI for image generation")
+        """Handle IMAGE_GENERATE operation type via the image generation path."""
+        from modules.services.serviceGeneration.paths.imagePath import ImageGenerationPath

-        request = AiCallRequest(
-            prompt=prompt,
-            context="",
-            options=options
-        )
+        imagePath = ImageGenerationPath(self.services)

-        response = await self.callAi(request)
+        # Extract format from options
+        format = options.resultFormat or "png"

-        if not response.content:
-            errorMsg = f"No image data returned: {response.content}"
-            logger.error(f"Error in AI image generation: {errorMsg}")
-            self.services.chat.progressLogFinish(aiOperationId, False)
-            raise ValueError(errorMsg)
-
-        imageDoc = DocumentData(
-            documentName="generated_image.png",
-            documentData=response.content,
-            mimeType="image/png"
-        )
-
-        metadata = AiResponseMetadata(
-            title=title or "Generated Image",
-            operationType=options.operationType.value
-        )
-
-        self.services.chat.storeWorkflowStat(
-            self.services.workflow,
-            response,
-            "ai.generate.image"
-        )
-
-        self.services.chat.progressLogUpdate(aiOperationId, 0.9, "Image generated")
-        self.services.chat.progressLogFinish(aiOperationId, True)
-
-        return AiResponse(
-            content=response.content,
-            metadata=metadata,
-            documents=[imageDoc]
+        return await imagePath.generateImages(
+            userPrompt=prompt,
+            format=format,
+            title=title,
+            parentOperationId=parentOperationId
         )
     async def _handleWebOperation(
@@ -441,54 +410,54 @@ Respond with ONLY a JSON object in this exact format:
                 return intent
         return None

-    async def _clarifyDocumentIntents(
+    async def clarifyDocumentIntents(
         self,
         documents: List[ChatDocument],
         userPrompt: str,
         actionParameters: Dict[str, Any],
         parentOperationId: str
     ) -> List[DocumentIntent]:
-        """Delegate to DocumentIntentAnalyzer."""
+        """Public method: Delegate to DocumentIntentAnalyzer."""
         return await self.intentAnalyzer.clarifyDocumentIntents(
             documents, userPrompt, actionParameters, parentOperationId
         )

-    async def _extractAndPrepareContent(
+    async def extractAndPrepareContent(
         self,
         documents: List[ChatDocument],
         documentIntents: List[DocumentIntent],
         parentOperationId: str
     ) -> List[ContentPart]:
-        """Delegate to ContentExtractor."""
+        """Public method: Delegate to ContentExtractor."""
         return await self.contentExtractor.extractAndPrepareContent(
             documents, documentIntents, parentOperationId, self._getIntentForDocument
         )

-    async def _generateStructure(
+    async def generateStructure(
         self,
         userPrompt: str,
         contentParts: List[ContentPart],
         outputFormat: str,
         parentOperationId: str
     ) -> Dict[str, Any]:
-        """Delegate to StructureGenerator."""
+        """Public method: Delegate to StructureGenerator."""
         return await self.structureGenerator.generateStructure(
             userPrompt, contentParts, outputFormat, parentOperationId
         )

-    async def _fillStructure(
+    async def fillStructure(
         self,
         structure: Dict[str, Any],
         contentParts: List[ContentPart],
         userPrompt: str,
         parentOperationId: str
     ) -> Dict[str, Any]:
-        """Delegate to StructureFiller."""
+        """Public method: Delegate to StructureFiller."""
         return await self.structureFiller.fillStructure(
             structure, contentParts, userPrompt, parentOperationId
         )

-    async def _renderResult(
+    async def renderResult(
         self,
         filledStructure: Dict[str, Any],
         outputFormat: str,
@@ -577,13 +546,14 @@ Respond with ONLY a JSON object in this exact format:
         documentIntents: Optional[List[DocumentIntent]] = None,
         outputFormat: Optional[str] = None,
         title: Optional[str] = None,
-        parentOperationId: Optional[str] = None
+        parentOperationId: Optional[str] = None,
+        generationIntent: Optional[str] = None  # NEW: Explicit intent from the action (skips detection)
     ) -> AiResponse:
         """
-        Einheitliche AI-Content-Verarbeitung - Single Entry Point für alle AI-Actions.
+        Unified AI content generation with an explicit intent requirement.

-        Alle AI-Actions (ai.process, ai.generateDocument, etc.) routen hier durch.
-        Sie unterscheiden sich nur in Parametern, nicht in Logik.
+        All AI actions (ai.process, ai.generateDocument, etc.) route through here.
+        They differ only in parameters, not in logic.

         Args:
             prompt: The main prompt for the AI call
@@ -594,6 +564,8 @@ Respond with ONLY a JSON object in this exact format:
             outputFormat: Optional output format for document generation (e.g., 'pdf', 'docx', 'xlsx')
             title: Optional title for generated documents
             parentOperationId: Optional parent operation ID for hierarchical logging
+            generationIntent: REQUIRED explicit intent ("document" | "code" | "image") from the action.
+                NO auto-detection - actions must explicitly specify the intent.
         Returns:
             AiResponse with content, metadata, and optional documents
         """
@@ -625,111 +597,73 @@ Respond with ONLY a JSON object in this exact format:
             # Route to operation-specific handlers
             if opType == OperationTypeEnum.IMAGE_GENERATE:
-                return await self._handleImageGeneration(prompt, options, title, aiOperationId)
+                # Image generation - route to the image path
+                return await self._handleImageGeneration(prompt, options, title, parentOperationId)

             if opType == OperationTypeEnum.WEB_SEARCH or opType == OperationTypeEnum.WEB_CRAWL:
                 return await self._handleWebOperation(prompt, options, opType, aiOperationId)

-            # Document generation path
-            options.compressPrompt = False
-            options.compressContext = False
-
-            # Step 5A: Clarify document intents
-            documents = []
-            if documentList:
-                documents = self.services.chat.getChatDocumentsFromDocumentList(documentList)
-
-            if not documentIntents and documents:
-                documentIntents = await self._clarifyDocumentIntents(
-                    documents,
-                    prompt,
-                    {"outputFormat": outputFormat},
-                    aiOperationId
-                )
-
-            # Step 5B: Extract and prepare content
-            if documents:
-                preparedContentParts = await self._extractAndPrepareContent(
-                    documents,
-                    documentIntents or [],
-                    aiOperationId
-                )
-
-                # Merge with provided contentParts (if any)
-                if contentParts:
-                    # Check for pre-extracted content
-                    for part in contentParts:
-                        if part.metadata.get("skipExtraction", False):
-                            # Already extracted - use as-is, and make sure the metadata is complete
-                            part.metadata.setdefault("contentFormat", "extracted")
-                            part.metadata.setdefault("isPreExtracted", True)
-                    preparedContentParts.extend(contentParts)
-
-                contentParts = preparedContentParts
-
-            # Step 5C: Generate structure
-            structure = await self._generateStructure(
-                prompt,
-                contentParts or [],
-                outputFormat,
-                aiOperationId
-            )
-
-            # Step 5D: Fill structure
-            # Language will be extracted from services (user intention analysis) in fillStructure
-            filledStructure = await self._fillStructure(
-                structure,
-                contentParts or [],
-                prompt,
-                aiOperationId
-            )
-
-            # Step 5E: Render the result
-            # Each document is rendered individually and may return 1..n files (e.g., HTML + images)
-            renderedDocuments = await self._renderResult(
-                filledStructure,
-                outputFormat,
-                title or "Generated Document",
-                prompt,
-                aiOperationId
-            )
-
-            # Build the response: convert all rendered documents to DocumentData
-            documentDataList = []
-            for renderedDoc in renderedDocuments:
-                try:
-                    # Create DocumentData for each rendered document
-                    docDataObj = DocumentData(
-                        documentName=renderedDoc.filename,
-                        documentData=renderedDoc.documentData,
-                        mimeType=renderedDoc.mimeType,
-                        sourceJson=filledStructure if len(documentDataList) == 0 else None  # Only for the first document
+            # Data generation - REQUIRES explicit generationIntent
+            if opType == OperationTypeEnum.DATA_GENERATE:
+                if not generationIntent:
+                    errorMsg = (
+                        "generationIntent is required for DATA_GENERATE operation. "
+                        "Actions must explicitly specify 'document' or 'code' intent. "
+                        "No auto-detection - use qualified actions (ai.generateDocument, ai.generateCode)."
+                    )
+                    logger.error(errorMsg)
+                    self.services.chat.progressLogFinish(aiOperationId, False)
+                    raise ValueError(errorMsg)
+
+                # Route based on explicit intent (no auto-detection, no fallback)
+                if generationIntent == "code":
+                    # Route to the code generation path
+                    return await self._handleCodeGeneration(
+                        prompt=prompt,
+                        options=options,
+                        contentParts=contentParts,
+                        outputFormat=outputFormat,
+                        title=title,
+                        parentOperationId=parentOperationId
+                    )
+                else:
+                    # Route to the document generation path (existing behavior)
+                    return await self._handleDocumentGeneration(
+                        prompt=prompt,
+                        options=options,
+                        documentList=documentList,
+                        documentIntents=documentIntents,
+                        contentParts=contentParts,
+                        outputFormat=outputFormat,
+                        title=title,
+                        parentOperationId=parentOperationId
                     )
-                    documentDataList.append(docDataObj)
-                    logger.debug(f"Added rendered document: {renderedDoc.filename} ({len(renderedDoc.documentData)} bytes, {renderedDoc.mimeType})")
-                except Exception as e:
-                    logger.warning(f"Error creating document {renderedDoc.filename}: {str(e)}")

-            if not documentDataList:
-                raise ValueError("No documents were rendered")
+            # DATA_EXTRACT: Extract content from documents and process with AI (no structure generation)
+            if opType == OperationTypeEnum.DATA_EXTRACT:
+                return await self._handleDataExtraction(
+                    prompt=prompt,
+                    options=options,
+                    documentList=documentList,
+                    documentIntents=documentIntents,
+                    contentParts=contentParts,
+                    outputFormat=outputFormat,
+                    title=title,
+                    parentOperationId=parentOperationId
+                )

-            metadata = AiResponseMetadata(
-                title=title or filledStructure.get("metadata", {}).get("title", "Generated Document"),
-                operationType=opType.value
-            )
-
-            # Debug log (harmonized)
-            self.services.utils.writeDebugFile(
-                json.dumps(filledStructure, indent=2, ensure_ascii=False, default=str),
-                "document_generation_response"
-            )
-
-            self.services.chat.progressLogFinish(aiOperationId, True)
-
-            return AiResponse(
-                content=json.dumps(filledStructure),
-                metadata=metadata,
-                documents=documentDataList
+            # Other operation types (DATA_ANALYSE, etc.) - existing logic.
+            # Fallback to document generation for backward compatibility (should not happen).
+            logger.warning(f"Unhandled operation type: {opType}, falling back to document generation")
+            return await self._handleDocumentGeneration(
+                prompt=prompt,
+                options=options,
+                documentList=documentList,
+                documentIntents=documentIntents,
+                contentParts=contentParts,
+                outputFormat=outputFormat,
+                title=title,
+                parentOperationId=parentOperationId
             )

         except Exception as e:
@@ -737,6 +671,166 @@ Respond with ONLY a JSON object in this exact format:
             self.services.chat.progressLogFinish(aiOperationId, False)
             raise
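
A minimal caller sketch of the new entry point; `runGenerateCodeAction` and the option values are illustrative, not part of this diff:

```python
# Hypothetical caller: an ai.generateCode-style action routing through
# processAiContent with an explicit intent (names invented for illustration).
from modules.datamodels.datamodelAi import (
    AiCallOptions, OperationTypeEnum, PriorityEnum, ProcessingModeEnum
)

async def runGenerateCodeAction(aiService, prompt: str):
    options = AiCallOptions(
        operationType=OperationTypeEnum.DATA_GENERATE,
        priority=PriorityEnum.QUALITY,
        processingMode=ProcessingModeEnum.DETAILED,
        resultFormat="py",
    )
    # generationIntent is mandatory for DATA_GENERATE; omitting it raises ValueError.
    return await aiService.processAiContent(
        prompt=prompt,
        options=options,
        outputFormat="py",
        title="Generated Script",
        generationIntent="code",  # "document" would take the document path instead
    )
```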
+ """ + import time + + # Create operation ID + workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}" + extractOperationId = f"data_extract_{workflowId}_{int(time.time())}" + + # Start progress tracking + self.services.chat.progressLogStart( + extractOperationId, + "Data Extraction", + "Extraction", + f"Format: {outputFormat}", + parentOperationId=parentOperationId + ) + + try: + # Step 1: Get documents from documentList + documents = [] + if documentList: + documents = self.services.chat.getChatDocumentsFromDocumentList(documentList) + + # Step 2: Clarify document intents (if not provided) - REQUIRED for all documents + if not documentIntents and documents: + documentIntents = await self.clarifyDocumentIntents( + documents, + prompt, + {"outputFormat": outputFormat}, + extractOperationId + ) + + # Step 3: Extract and prepare content (NO AI - pure extraction) - REQUIRED for all documents + if documents: + preparedContentParts = await self.extractAndPrepareContent( + documents, + documentIntents or [], + extractOperationId + ) + + # Merge with provided contentParts (if any) + if contentParts: + for part in contentParts: + if part.metadata.get("skipExtraction", False): + part.metadata.setdefault("contentFormat", "extracted") + part.metadata.setdefault("isPreExtracted", True) + preparedContentParts.extend(contentParts) + + contentParts = preparedContentParts + + # Step 4: Process extracted contentParts with AI (simple text processing, no structure generation) + if not contentParts: + raise ValueError("No content extracted from documents") + + # Use simple AI call to process extracted content + # Prepare content for AI processing + contentText = "\n\n".join([ + f"[Document: {part.metadata.get('documentName', 'Unknown')}]\n{part.data}" + for part in contentParts + if part.data + ]) + + # Call AI with extracted content + aiRequest = AiCallRequest( + prompt=f"{prompt}\n\nExtracted Content:\n{contentText}", + context="", + options=options + ) + + aiResponse = await self.callAi(aiRequest) + + # Create response document + resultDocument = DocumentData( + documentName=f"{title or 'extracted_data'}.{outputFormat}", + documentData=aiResponse.content.encode('utf-8') if isinstance(aiResponse.content, str) else aiResponse.content, + mimeType=f"text/{outputFormat}" if outputFormat in ["txt", "json", "csv"] else "application/octet-stream" + ) + + metadata = AiResponseMetadata( + title=title or "Extracted Data", + operationType=OperationTypeEnum.DATA_EXTRACT.value + ) + + self.services.chat.progressLogFinish(extractOperationId, True) + + return AiResponse( + content=aiResponse.content if isinstance(aiResponse.content, str) else aiResponse.content.decode('utf-8', errors='replace'), + metadata=metadata, + documents=[resultDocument] + ) + + except Exception as e: + logger.error(f"Error in data extraction: {str(e)}") + self.services.chat.progressLogFinish(extractOperationId, False) + raise + + async def _handleCodeGeneration( + self, + prompt: str, + options: AiCallOptions, + contentParts: Optional[List[ContentPart]], + outputFormat: str, + title: str, + parentOperationId: Optional[str] + ) -> AiResponse: + """Handle code generation using code generation path.""" + from modules.services.serviceGeneration.paths.codePath import CodeGenerationPath + + codePath = CodeGenerationPath(self.services) + return await codePath.generateCode( + userPrompt=prompt, + outputFormat=outputFormat, + contentParts=contentParts, + title=title or "Generated Code", + 
+    async def _handleCodeGeneration(
+        self,
+        prompt: str,
+        options: AiCallOptions,
+        contentParts: Optional[List[ContentPart]],
+        outputFormat: str,
+        title: str,
+        parentOperationId: Optional[str]
+    ) -> AiResponse:
+        """Handle code generation via the code generation path."""
+        from modules.services.serviceGeneration.paths.codePath import CodeGenerationPath
+
+        codePath = CodeGenerationPath(self.services)
+        return await codePath.generateCode(
+            userPrompt=prompt,
+            outputFormat=outputFormat,
+            contentParts=contentParts,
+            title=title or "Generated Code",
+            parentOperationId=parentOperationId
+        )
+
+    async def _handleDocumentGeneration(
+        self,
+        prompt: str,
+        options: AiCallOptions,
+        documentList: Optional[Any],
+        documentIntents: Optional[List[DocumentIntent]],
+        contentParts: Optional[List[ContentPart]],
+        outputFormat: str,
+        title: str,
+        parentOperationId: Optional[str]
+    ) -> AiResponse:
+        """Handle document generation via the document generation path."""
+        from modules.services.serviceGeneration.paths.documentPath import DocumentGenerationPath
+
+        # Set compression options for document generation
+        options.compressPrompt = False
+        options.compressContext = False
+
+        documentPath = DocumentGenerationPath(self.services)
+        return await documentPath.generateDocument(
+            userPrompt=prompt,
+            documentList=documentList,
+            documentIntents=documentIntents,
+            contentParts=contentParts,
+            outputFormat=outputFormat,
+            title=title or "Generated Document",
+            parentOperationId=parentOperationId
+        )
+
     def _determineDocumentName(
         self,
         filledStructure: Dict[str, Any],
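
The three handlers above are now thin adapters over per-intent path classes. The shared shape is only implied by the diff; a hedged sketch of that implicit contract (the Protocol below is illustrative and does not exist in the codebase — the actual entry points are generateImages, generateCode, and generateDocument):

```python
# Illustrative only: the duck-typed contract the _handle* adapters rely on.
from typing import Any, Optional, Protocol

class GenerationPathLike(Protocol):
    def __init__(self, services: Any) -> None: ...

    async def generate(
        self,
        userPrompt: str,
        title: str,
        parentOperationId: Optional[str],
        **pathSpecificKwargs: Any,
    ) -> Any:  # each path returns an AiResponse
        ...
```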
" + f"Available use cases: {list(self.useCaseRegistry.useCases.keys())}" + ) + logger.error(errorMsg) + raise ValueError(errorMsg) + maxIterations = 50 # Prevent infinite loops iteration = 0 allSections = [] # Accumulate all sections across iterations @@ -199,36 +223,31 @@ class AiCallLooper: # Store raw response for continuation (even if broken) lastRawResponse = result - # Check if this is section content generation (has "elements" not "sections") - # Section content generation returns JSON with "elements" array, not document structure with "sections" - isSectionContentGeneration = False - parsedJsonForSection = None - extractedJsonForSection = None + # Parse JSON for use case handling + parsedJsonForUseCase = None + extractedJsonForUseCase = None + try: - extractedJsonForSection = extractJsonString(result) - parsedJson, parseError, _ = tryParseJson(extractedJsonForSection) + extractedJsonForUseCase = extractJsonString(result) + parsedJson, parseError, _ = tryParseJson(extractedJsonForUseCase) if parseError is None and parsedJson: - parsedJsonForSection = parsedJson - # Check if JSON has "elements" (section content) or "sections" (document structure) - if isinstance(parsedJson, dict): - if "elements" in parsedJson: - isSectionContentGeneration = True - elif isinstance(parsedJson, list) and len(parsedJson) > 0: - # Check if it's a list of elements (section content format) - if isinstance(parsedJson[0], dict) and "type" in parsedJson[0]: - isSectionContentGeneration = True + parsedJsonForUseCase = parsedJson except Exception: pass - if isSectionContentGeneration: - # This is section content generation - return the JSON directly - # No need to extract sections, just return the complete JSON string - logger.info(f"Iteration {iteration}: Section content generation detected (elements found), returning JSON directly") + # Handle use cases that return JSON directly (no section extraction needed) + directReturnUseCases = ["section_content", "chapter_structure", "code_structure", "code_content", "image_batch"] + if useCaseId in directReturnUseCases: + logger.info(f"Iteration {iteration}: Use case '{useCaseId}' - returning JSON directly") if iterationOperationId: self.services.chat.progressLogFinish(iterationOperationId, True) - # Note: Debug files (_prompt and _response) are already written above for iteration 1 - # No need to write _final_result as it's redundant with _response - final_json = json.dumps(parsedJsonForSection, indent=2, ensure_ascii=False) if parsedJsonForSection else (extractedJsonForSection or result) + + final_json = json.dumps(parsedJsonForUseCase, indent=2, ensure_ascii=False) if parsedJsonForUseCase else (extractedJsonForUseCase or result) + + # Write final result for chapter structure and code structure (section_content skips it) + if useCaseId in ["chapter_structure", "code_structure"]: + self.services.utils.writeDebugFile(final_json, f"{debugPrefix}_final_result") + return final_json # Extract sections from response (handles both valid and broken JSON) diff --git a/modules/services/serviceAi/subLoopingUseCases.py b/modules/services/serviceAi/subLoopingUseCases.py new file mode 100644 index 00000000..c52ed1bc --- /dev/null +++ b/modules/services/serviceAi/subLoopingUseCases.py @@ -0,0 +1,231 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +""" +Generic Looping Use Case System + +Provides parametrized looping infrastructure supporting different JSON formats and use cases. 
+""" + +import logging +from dataclasses import dataclass, field +from typing import Dict, Any, List, Optional, Callable + +logger = logging.getLogger(__name__) + + +@dataclass +class LoopingUseCase: + """Configuration for a specific looping use case.""" + + # Identification + useCaseId: str # "section_content", "chapter_structure", "document_structure", "code_structure", "code_content", "image_batch" + + # JSON Format Detection + jsonTemplate: Dict[str, Any] # Expected JSON structure template + detectionKeys: List[str] # Keys to check for format detection (e.g., ["elements"], ["chapters"], ["files"]) + detectionPath: str # JSONPath to check (e.g., "documents[0].chapters", "files[0].content") + + # Prompt Building + initialPromptBuilder: Optional[Callable] = None # Function to build initial prompt + continuationPromptBuilder: Optional[Callable] = None # Function to build continuation prompt + + # Accumulation & Merging + accumulator: Optional[Callable] = None # Function to accumulate fragments + merger: Optional[Callable] = None # Function to merge accumulated data + + # Continuation Context + continuationContextBuilder: Optional[Callable] = None # Build continuation context for this format + + # Result Building + resultBuilder: Optional[Callable] = None # Build final result from accumulated data + + # Metadata + supportsAccumulation: bool = True # Whether this use case supports accumulation + requiresExtraction: bool = False # Whether this requires extraction (like sections) + + +class LoopingUseCaseRegistry: + """Registry of all looping use cases.""" + + def __init__(self): + self.useCases: Dict[str, LoopingUseCase] = {} + self._registerDefaultUseCases() + + def register(self, useCase: LoopingUseCase): + """Register a new use case.""" + self.useCases[useCase.useCaseId] = useCase + logger.debug(f"Registered looping use case: {useCase.useCaseId}") + + def get(self, useCaseId: str) -> Optional[LoopingUseCase]: + """Get use case by ID.""" + return self.useCases.get(useCaseId) + + def detectUseCase(self, parsedJson: Dict[str, Any]) -> Optional[str]: + """Detect which use case matches the JSON structure.""" + for useCaseId, useCase in self.useCases.items(): + if self._matchesFormat(parsedJson, useCase): + return useCaseId + return None + + def _matchesFormat(self, json: Dict[str, Any], useCase: LoopingUseCase) -> bool: + """Check if JSON matches use case format.""" + # Check top-level keys + for key in useCase.detectionKeys: + if key in json: + return True + + # Check nested path using simple dictionary traversal (no jsonpath_ng needed) + if useCase.detectionPath: + try: + # Simple path matching without jsonpath_ng + # Format: "documents[0].chapters" or "files[0].content" + pathParts = useCase.detectionPath.split(".") + current = json + + for part in pathParts: + # Handle array indices like "documents[0]" + if "[" in part and "]" in part: + key = part.split("[")[0] + index = int(part.split("[")[1].split("]")[0]) + if isinstance(current, dict) and key in current: + if isinstance(current[key], list) and 0 <= index < len(current[key]): + current = current[key][index] + else: + return False + else: + return False + else: + # Regular key access + if isinstance(current, dict) and part in current: + current = current[part] + else: + return False + + # If we successfully traversed the path, it matches + return True + except Exception as e: + logger.debug(f"Path matching failed for {useCase.useCaseId}: {e}") + + return False + + def _registerDefaultUseCases(self): + """Register default use cases.""" + 
+
+        # Use Case 1: Section Content Generation
+        # Returns JSON with an "elements" array directly
+        self.register(LoopingUseCase(
+            useCaseId="section_content",
+            jsonTemplate={"elements": []},
+            detectionKeys=["elements"],
+            detectionPath="",
+            initialPromptBuilder=None,  # Will use the default prompt builder
+            continuationPromptBuilder=None,  # Will use the default continuation builder
+            accumulator=None,  # Direct return, no accumulation
+            merger=None,
+            continuationContextBuilder=None,  # Will use the default continuation context
+            resultBuilder=None,  # Return JSON directly
+            supportsAccumulation=False,
+            requiresExtraction=False
+        ))
+
+        # Use Case 2: Chapter Structure Generation
+        # Returns JSON with a "documents[0].chapters" structure
+        self.register(LoopingUseCase(
+            useCaseId="chapter_structure",
+            jsonTemplate={"documents": [{"chapters": []}]},
+            detectionKeys=["chapters"],
+            detectionPath="documents[0].chapters",
+            initialPromptBuilder=None,
+            continuationPromptBuilder=None,
+            accumulator=None,  # Direct return, no accumulation
+            merger=None,
+            continuationContextBuilder=None,
+            resultBuilder=None,  # Return JSON directly
+            supportsAccumulation=False,
+            requiresExtraction=False
+        ))
+
+        # Use Case 3: Document Structure Generation
+        # Returns JSON with a "documents[0].sections" structure; requires extraction and accumulation
+        self.register(LoopingUseCase(
+            useCaseId="document_structure",
+            jsonTemplate={"documents": [{"sections": []}]},
+            detectionKeys=["sections"],
+            detectionPath="documents[0].sections",
+            initialPromptBuilder=None,
+            continuationPromptBuilder=None,
+            accumulator=None,  # Will use the default accumulator
+            merger=None,  # Will use the default merger
+            continuationContextBuilder=None,
+            resultBuilder=None,  # Will use the default result builder
+            supportsAccumulation=True,
+            requiresExtraction=True
+        ))
+
+        # Use Case 4: Code Structure Generation (NEW)
+        self.register(LoopingUseCase(
+            useCaseId="code_structure",
+            jsonTemplate={
+                "metadata": {
+                    "language": "",
+                    "projectType": "single_file|multi_file",
+                    "projectName": ""
+                },
+                "files": [
+                    {
+                        "id": "",
+                        "filename": "",
+                        "fileType": "",
+                        "dependencies": [],
+                        "imports": [],
+                        "functions": [],
+                        "classes": []
+                    }
+                ]
+            },
+            detectionKeys=["files"],
+            detectionPath="files",
+            initialPromptBuilder=None,
+            continuationPromptBuilder=None,
+            accumulator=None,  # Direct return
+            merger=None,
+            continuationContextBuilder=None,
+            resultBuilder=None,
+            supportsAccumulation=False,
+            requiresExtraction=False
+        ))
+
+        # Use Case 5: Code Content Generation (NEW)
+        self.register(LoopingUseCase(
+            useCaseId="code_content",
+            jsonTemplate={"files": [{"content": "", "functions": []}]},
+            detectionKeys=["content", "functions"],
+            detectionPath="files[0].content",
+            initialPromptBuilder=None,
+            continuationPromptBuilder=None,
+            accumulator=None,  # Will use the default accumulator
+            merger=None,  # Will use the default merger
+            continuationContextBuilder=None,
+            resultBuilder=None,  # Will use the default result builder
+            supportsAccumulation=True,
+            requiresExtraction=False
+        ))
+
+        # Use Case 6: Image Batch Generation (NEW)
+        self.register(LoopingUseCase(
+            useCaseId="image_batch",
+            jsonTemplate={"images": []},
+            detectionKeys=["images"],
+            detectionPath="images",
+            initialPromptBuilder=None,
+            continuationPromptBuilder=None,
+            accumulator=None,  # Direct return
+            merger=None,
+            continuationContextBuilder=None,
+            resultBuilder=None,
+            supportsAccumulation=False,
+            requiresExtraction=False
+        ))
+
+        logger.info(f"Registered {len(self.useCases)} default looping use cases")
+
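Although callers must now pass useCaseId explicitly, the registry's detection helpers are still exercisable in isolation; a small usage sketch against the registrations above (expected results in comments):

```python
from modules.services.serviceAi.subLoopingUseCases import LoopingUseCaseRegistry

registry = LoopingUseCaseRegistry()

# Top-level detectionKeys match:
print(registry.detectUseCase({"elements": [{"type": "paragraph"}]}))
# -> "section_content"

# Nested detectionPath traversal ("documents[0].chapters"):
print(registry.detectUseCase({"documents": [{"chapters": []}]}))
# -> "chapter_structure"

# Unknown shapes return None instead of guessing:
print(registry.detectUseCase({"foo": "bar"}))  # -> None
```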
diff --git a/modules/services/serviceAi/subStructureFilling.py b/modules/services/serviceAi/subStructureFilling.py
index 138f6572..5a917279 100644
--- a/modules/services/serviceAi/subStructureFilling.py
+++ b/modules/services/serviceAi/subStructureFilling.py
@@ -23,11 +23,20 @@ logger = logging.getLogger(__name__)
 class StructureFiller:
     """Handles filling document structure with content."""

+    # Default concurrency limit for parallel generation (chapters/sections)
+    DEFAULT_MAX_CONCURRENT_GENERATION = 16
+
     def __init__(self, services, aiService):
         """Initialize StructureFiller with service center and AI service access."""
         self.services = services
         self.aiService = aiService

+    def _getMaxConcurrentGeneration(self, options: Optional[AiCallOptions] = None) -> int:
+        """Get the max concurrent generation limit, configurable via options."""
+        if options and hasattr(options, 'maxConcurrentGeneration'):
+            return options.maxConcurrentGeneration
+        return self.DEFAULT_MAX_CONCURRENT_GENERATION
+
     def _getUserLanguage(self) -> str:
         """Get user language for document generation"""
         try:
@@ -101,14 +110,19 @@ class StructureFiller:
         try:
             filledStructure = copy.deepcopy(structure)

+            # Get options from the AI service if available (for concurrency control).
+            # The default concurrency limit (16) is used if options is None.
+            options = None
+            # Note: Options can be passed via fillStructure if needed in the future
+
             # Phase 5D.1: Generate the sections structure for each chapter
             filledStructure = await self._generateChapterSectionsStructure(
-                filledStructure, contentParts, userPrompt, fillOperationId, language
+                filledStructure, contentParts, userPrompt, fillOperationId, language, options
             )

             # Phase 5D.2: Fill sections with ContentParts
             filledStructure = await self._fillChapterSections(
-                filledStructure, contentParts, userPrompt, fillOperationId, language
+                filledStructure, contentParts, userPrompt, fillOperationId, language, options
             )

             # Flattening: convert chapters to sections
@@ -243,7 +257,8 @@ class StructureFiller:
         contentParts: List[ContentPart],
         userPrompt: str,
         parentOperationId: str,
-        language: str
+        language: str,
+        options: Optional[AiCallOptions] = None
     ) -> Dict[str, Any]:
         """
         Phase 5D.1: Generates the sections structure for each chapter (without content) in parallel.
@@ -252,6 +267,10 @@ class StructureFiller:
         # Count total chapters for progress tracking
         totalChapters = sum(len(doc.get("chapters", [])) for doc in chapterStructure.get("documents", []))

+        # Get the concurrency limit
+        maxConcurrent = self._getMaxConcurrentGeneration(options)
+        semaphore = asyncio.Semaphore(maxConcurrent)
+
         # Collect all chapters with their indices for parallel processing
         chapterTasks = []
         chapterIndex = 0
@@ -266,25 +285,31 @@ class StructureFiller:
                 contentPartIds = chapter.get("contentPartIds", [])
                 contentPartInstructions = chapter.get("contentPartInstructions", {})

-                # Create task for parallel processing
-                task = self._generateSingleChapterSectionsStructure(
-                    chapter=chapter,
-                    chapterIndex=chapterIndex,
-                    chapterId=chapterId,
-                    chapterLevel=chapterLevel,
-                    chapterTitle=chapterTitle,
-                    generationHint=generationHint,
-                    contentPartIds=contentPartIds,
-                    contentPartInstructions=contentPartInstructions,
-                    contentParts=contentParts,
-                    userPrompt=userPrompt,
-                    language=language,
-                    parentOperationId=parentOperationId,
-                    totalChapters=totalChapters
+                # Create task for parallel processing with semaphore
+                async def processChapterWithSemaphore(chapter, chapterIndex, chapterId, chapterLevel, chapterTitle, generationHint, contentPartIds, contentPartInstructions):
+                    async with semaphore:
+                        return await self._generateSingleChapterSectionsStructure(
+                            chapter=chapter,
+                            chapterIndex=chapterIndex,
+                            chapterId=chapterId,
+                            chapterLevel=chapterLevel,
+                            chapterTitle=chapterTitle,
+                            generationHint=generationHint,
+                            contentPartIds=contentPartIds,
+                            contentPartInstructions=contentPartInstructions,
+                            contentParts=contentParts,
+                            userPrompt=userPrompt,
+                            language=language,
+                            parentOperationId=parentOperationId,
+                            totalChapters=totalChapters
+                        )
+
+                task = processChapterWithSemaphore(
+                    chapter, chapterIndex, chapterId, chapterLevel, chapterTitle, generationHint, contentPartIds, contentPartInstructions
                 )
                 chapterTasks.append((chapterIndex, chapter, task))

-        # Execute all chapter tasks in parallel
+        # Execute all chapter tasks in parallel with concurrency control
         if chapterTasks:
             # Create list of tasks (without indices for gather)
             tasks = [task for _, _, task in chapterTasks]
@@ -386,11 +411,25 @@ class StructureFiller:
                 if generatedElements:
                     elements.extend(generatedElements)
             else:
-                # Fallback: Try to parse JSON response directly
+                # Fallback: Try to parse JSON response directly with repair logic
                 try:
-                    fallbackElements = json.loads(
-                        self.services.utils.jsonExtractString(aiResponse.content)
-                    )
+                    from modules.shared.jsonUtils import tryParseJson, repairBrokenJson
+
+                    # Use tryParseJson, which handles extraction and basic parsing
+                    fallbackElements, parseError, cleanedStr = tryParseJson(aiResponse.content)
+
+                    # If parsing failed, try repair
+                    if parseError and isinstance(aiResponse.content, str):
+                        logger.warning(f"Initial JSON parse failed for section {sectionId}, attempting repair: {str(parseError)}")
+                        repairedJson = repairBrokenJson(aiResponse.content)
+                        if repairedJson:
+                            fallbackElements = repairedJson
+                            parseError = None
+                            logger.info(f"Successfully repaired JSON for section {sectionId}")
+
+                    if parseError:
+                        raise parseError
+
                     if isinstance(fallbackElements, list):
                         elements.extend(fallbackElements)
                     elif isinstance(fallbackElements, dict) and "elements" in fallbackElements:
@@ -621,7 +660,7 @@ The JSON should be a fragment that can be merged with the previous response."""
             processingMode=ProcessingModeEnum.DETAILED
         )

-        aiResponseJson = await self.aiService._callAiWithLooping(
+        aiResponseJson = await self.aiService.callAiWithLooping(
             prompt=generationPrompt,
             options=options,
             debugPrefix=f"{chapterId}_section_{sectionId}",
@@ -638,11 +677,28 @@ The JSON should be a fragment that can be merged with the previous response."""
             },
             operationId=sectionOperationId,
             userPrompt=userPrompt,
-            contentParts=extractedParts
+            contentParts=extractedParts,
+            useCaseId="section_content"  # REQUIRED: Explicit use case ID
         )

         try:
-            parsedResponse = json.loads(self.services.utils.jsonExtractString(aiResponseJson))
+            # Use tryParseJson, which handles extraction and basic parsing
+            from modules.shared.jsonUtils import tryParseJson, repairBrokenJson
+
+            parsedResponse, parseError, cleanedStr = tryParseJson(aiResponseJson)
+
+            # If parsing failed, try repair
+            if parseError and isinstance(aiResponseJson, str):
+                logger.warning(f"Initial JSON parse failed for section {sectionId}, attempting repair: {str(parseError)}")
+                repairedJson = repairBrokenJson(aiResponseJson)
+                if repairedJson:
+                    parsedResponse = repairedJson
+                    parseError = None
+                    logger.info(f"Successfully repaired JSON for section {sectionId}")
+
+            if parseError:
+                raise parseError
+
             if isinstance(parsedResponse, list):
                 generatedElements = parsedResponse
             elif isinstance(parsedResponse, dict):
@@ -824,7 +880,7 @@ The JSON should be a fragment that can be merged with the previous response."""
             processingMode=ProcessingModeEnum.DETAILED
         )

-        aiResponseJson = await self.aiService._callAiWithLooping(
+        aiResponseJson = await self.aiService.callAiWithLooping(
             prompt=generationPrompt,
             options=options,
             debugPrefix=f"{chapterId}_section_{sectionId}",
@@ -841,7 +897,8 @@ The JSON should be a fragment that can be merged with the previous response."""
             },
             operationId=sectionOperationId,
             userPrompt=userPrompt,
-            contentParts=[]
+            contentParts=[],
+            useCaseId="section_content"  # REQUIRED: Explicit use case ID
         )

         try:
@@ -1060,7 +1117,7 @@ The JSON should be a fragment that can be merged with the previous response."""
             processingMode=ProcessingModeEnum.DETAILED
         )

-        aiResponseJson = await self.aiService._callAiWithLooping(
+        aiResponseJson = await self.aiService.callAiWithLooping(
             prompt=generationPrompt,
             options=options,
             debugPrefix=f"{chapterId}_section_{sectionId}",
@@ -1077,7 +1134,8 @@ The JSON should be a fragment that can be merged with the previous response."""
             },
             operationId=sectionOperationId,
             userPrompt=userPrompt,
-            contentParts=[part]
+            contentParts=[part],
+            useCaseId="section_content"  # REQUIRED: Explicit use case ID
         )

         try:
@@ -1200,7 +1258,8 @@ The JSON should be a fragment that can be merged with the previous response."""
         contentParts: List[ContentPart],
         userPrompt: str,
         parentOperationId: str,
-        language: str
+        language: str,
+        options: Optional[AiCallOptions] = None
     ) -> Dict[str, Any]:
         """
         Phase 5D.2: Fills sections with ContentParts.
@@ -1217,6 +1276,10 @@ The JSON should be a fragment that can be merged with the previous response."""
         totalChapters = sum(len(doc.get("chapters", [])) for doc in chapterStructure.get("documents", []))
         fillOperationId = parentOperationId

+        # Get the concurrency limit for sections
+        maxConcurrent = self._getMaxConcurrentGeneration(options)
+        sectionSemaphore = asyncio.Semaphore(maxConcurrent)
+
         # Helper function to calculate overall progress
         def calculateOverallProgress(chapterIndex, totalChapters, sectionIndex, totalSections):
             """Calculate overall progress: 0.0 to 1.0"""
@@ -1251,28 +1314,34 @@ The JSON should be a fragment that can be merged with the previous response."""
                 parentOperationId=fillOperationId
             )

-            # Process sections within chapter in parallel
+            # Process sections within chapter in parallel with concurrency control
             sectionTasks = []
             for sectionIndex, section in enumerate(sections):
-                # Create task for parallel processing
-                task = self._processSingleSection(
-                    section=section,
-                    sectionIndex=sectionIndex,
-                    totalSections=totalSections,
-                    chapterIndex=chapterIndex,
-                    totalChapters=totalChapters,
-                    chapterId=chapterId,
-                    chapterOperationId=chapterOperationId,
-                    fillOperationId=fillOperationId,
-                    contentParts=contentParts,
-                    userPrompt=userPrompt,
-                    all_sections_list=all_sections_list,
-                    language=language,
-                    calculateOverallProgress=calculateOverallProgress
+                # Create task wrapper with semaphore for parallel processing
+                async def processSectionWithSemaphore(section, sectionIndex, totalSections, chapterIndex, totalChapters, chapterId, chapterOperationId, fillOperationId, contentParts, userPrompt, all_sections_list, language, calculateOverallProgress):
+                    async with sectionSemaphore:
+                        return await self._processSingleSection(
+                            section=section,
+                            sectionIndex=sectionIndex,
+                            totalSections=totalSections,
+                            chapterIndex=chapterIndex,
+                            totalChapters=totalChapters,
+                            chapterId=chapterId,
+                            chapterOperationId=chapterOperationId,
+                            fillOperationId=fillOperationId,
+                            contentParts=contentParts,
+                            userPrompt=userPrompt,
+                            all_sections_list=all_sections_list,
+                            language=language,
+                            calculateOverallProgress=calculateOverallProgress
+                        )
+
+                task = processSectionWithSemaphore(
+                    section, sectionIndex, totalSections, chapterIndex, totalChapters, chapterId, chapterOperationId, fillOperationId, contentParts, userPrompt, all_sections_list, language, calculateOverallProgress
                 )
                 sectionTasks.append((sectionIndex, section, task))

-            # Execute all section tasks in parallel
+            # Execute all section tasks in parallel with concurrency control
             if sectionTasks:
                 # Create list of tasks (without indices for gather)
                 tasks = [task for _, _, task in sectionTasks]
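
Both phases use the same bounded-concurrency shape: create coroutines guarded by a shared asyncio.Semaphore, then gather them. A distilled, runnable version of that pattern:

```python
import asyncio

async def boundedGather(items, worker, maxConcurrent=16):
    """Run worker(item) for every item, at most maxConcurrent at a time."""
    semaphore = asyncio.Semaphore(maxConcurrent)

    async def runOne(item):
        async with semaphore:  # released automatically, even on exceptions
            return await worker(item)

    return await asyncio.gather(*(runOne(item) for item in items))

async def demo():
    async def square(n):
        await asyncio.sleep(0.01)  # stand-in for an AI call
        return n * n
    print(await boundedGather(range(8), square, maxConcurrent=3))
    # [0, 1, 4, 9, 16, 25, 36, 49]

asyncio.run(demo())
```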
diff --git a/modules/services/serviceAi/subStructureGeneration.py b/modules/services/serviceAi/subStructureGeneration.py
index bee83706..cbabd2fc 100644
--- a/modules/services/serviceAi/subStructureGeneration.py
+++ b/modules/services/serviceAi/subStructureGeneration.py
@@ -9,9 +9,10 @@ Handles document structure generation, including:
 """
 import json
 import logging
-from typing import Dict, Any, List
+from typing import Dict, Any, List, Optional

 from modules.datamodels.datamodelExtraction import ContentPart
+from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, PriorityEnum, ProcessingModeEnum

 logger = logging.getLogger(__name__)

@@ -82,28 +83,89 @@ class StructureGenerator:
             outputFormat=outputFormat
         )

-        # AI call for chapter structure generation
-        # Note: Debug logging is handled by callAiPlanning
-        aiResponse = await self.aiService.callAiPlanning(
-            prompt=structurePrompt,
-            debugType="chapter_structure_generation"
+        # AI call for chapter structure generation with looping support.
+        # Use callAiWithLooping instead of callAiPlanning so the call can continue if the response is cut off.
+        options = AiCallOptions(
+            operationType=OperationTypeEnum.DATA_GENERATE,
+            priority=PriorityEnum.QUALITY,
+            processingMode=ProcessingModeEnum.DETAILED,
+            compressPrompt=False,
+            compressContext=False,
+            resultFormat="json"
         )

-        # Parse structure
-        # Use tryParseJson which handles malformed JSON and unterminated strings
-        extractedJson = self.services.utils.jsonExtractString(aiResponse)
+        # Create a prompt builder for continuation support
+        async def buildChapterStructurePromptWithContinuation(
+            continuationContext: Optional[Dict[str, Any]] = None,
+            **kwargs
+        ) -> str:
+            """Build the chapter structure prompt with optional continuation context."""
+            basePrompt = self._buildChapterStructurePrompt(
+                userPrompt=userPrompt,
+                contentParts=contentParts,
+                outputFormat=outputFormat
+            )
+
+            if continuationContext:
+                # Add continuation instructions
+                deliveredSummary = continuationContext.get("delivered_summary", "")
+                elementBeforeCutoff = continuationContext.get("element_before_cutoff", "")
+                cutOffElement = continuationContext.get("cut_off_element", "")
+
+                continuationText = f"{deliveredSummary}\n\n"
+                continuationText += "⚠️ CONTINUATION: Response was cut off. Generate ONLY the remaining content that comes AFTER the reference elements below.\n\n"
+
+                if elementBeforeCutoff:
+                    continuationText += "# REFERENCE: Last complete element (already delivered - DO NOT repeat):\n"
+                    continuationText += f"{elementBeforeCutoff}\n\n"
+
+                if cutOffElement:
+                    continuationText += "# REFERENCE: Incomplete element (cut off here - DO NOT repeat):\n"
+                    continuationText += f"{cutOffElement}\n\n"
+
+                continuationText += "⚠️ CRITICAL: The elements above are REFERENCE ONLY. They are already delivered.\n"
+                continuationText += "Generate ONLY what comes AFTER these elements. DO NOT regenerate the entire JSON structure.\n"
+                continuationText += "Start directly with the next chapter that should follow.\n\n"
+
+                return f"""{basePrompt}
+
+{continuationText}
+
+Continue generating the remaining chapters now.
+"""
+            else:
+                return basePrompt
+
+        # Call AI with looping support
+        aiResponseJson = await self.aiService.callAiWithLooping(
+            prompt=structurePrompt,
+            options=options,
+            debugPrefix="chapter_structure_generation",
+            promptBuilder=buildChapterStructurePromptWithContinuation,
+            promptArgs={
+                "userPrompt": userPrompt,
+                "outputFormat": outputFormat,
+                "services": self.services
+            },
+            useCaseId="chapter_structure",  # REQUIRED: Explicit use case ID
+            operationId=structureOperationId,
+            userPrompt=userPrompt,
+            contentParts=contentParts
+        )
+
+        # Parse the complete JSON response (the looping system already handles completion)
+        extractedJson = self.services.utils.jsonExtractString(aiResponseJson)
         parsedJson, parseError, cleanedJson = self.services.utils.jsonTryParse(extractedJson)

         if parseError is not None:
-            # Try to repair broken JSON (handles unterminated strings, incomplete structures, etc.)
-            logger.warning(f"Initial JSON parsing failed: {str(parseError)}. Attempting repair...")
+            # Even with looping, try repair as a fallback
+            logger.warning(f"JSON parsing failed after looping: {str(parseError)}. Attempting repair...")
             from modules.shared import jsonUtils
             repairedJson = jsonUtils.repairBrokenJson(extractedJson)
             if repairedJson:
-                # Try parsing repaired JSON
                 parsedJson, parseError, _ = self.services.utils.jsonTryParse(json.dumps(repairedJson))
                 if parseError is None:
-                    logger.info("Successfully repaired and parsed JSON structure")
+                    logger.info("Successfully repaired and parsed JSON structure after looping")
                     structure = parsedJson
                 else:
                     logger.error(f"Failed to parse repaired JSON: {str(parseError)}")
diff --git a/modules/services/serviceExtraction/mainServiceExtraction.py b/modules/services/serviceExtraction/mainServiceExtraction.py
index 06877968..618a86e8 100644
--- a/modules/services/serviceExtraction/mainServiceExtraction.py
+++ b/modules/services/serviceExtraction/mainServiceExtraction.py
@@ -1254,9 +1254,10 @@ class ExtractionService:
         aiObjects,  # Pass interface for AI calls
         progressCallback=None
     ) -> AiCallResponse:
-        """Process content parts with model-aware chunking and AI calls.
+        """Process content parts with model-aware chunking and AI calls in parallel.

         Moved from interfaceAiObjects.callWithContentParts() - entry point for content parts processing.
+        Uses parallel processing similar to section generation for better performance.
         """
         prompt = request.prompt
         options = request.options
@@ -1269,13 +1270,65 @@ class ExtractionService:
         if not failoverModelList:
             return self._createErrorResponse("No suitable models found", 0, 0)

-        # Process each content part
+        totalParts = len(contentParts)
+        if totalParts == 0:
+            return self._createErrorResponse("No content parts to process", 0, 0)
+
+        # Mutable counter for progress tracking (a list cell so the nested coroutine can
+        # update it; all updates run on the single event loop, so no lock is needed)
+        completedCount = [0]
+
+        # Process parts in parallel with concurrency control
+        maxConcurrent = 5
+        if options and hasattr(options, 'maxConcurrentParts'):
+            maxConcurrent = options.maxConcurrentParts
+
+        semaphore = asyncio.Semaphore(maxConcurrent)
+
+        async def processSinglePart(contentPart, partIndex: int) -> AiCallResponse:
+            """Process a single content part with progress logging."""
+            async with semaphore:
+                partLabel = contentPart.label or f"Part {partIndex+1}"
+                partType = contentPart.typeGroup or "unknown"
+
+                # Log start of processing
+                if progressCallback:
+                    progressCallback(0.1 + (partIndex / totalParts) * 0.8, f"Processing {partLabel} ({partType}) - {partIndex+1}/{totalParts}")
+
+                try:
+                    # Process the part
+                    partResult = await self.processContentPartWithFallback(
+                        contentPart, prompt, options, failoverModelList, aiObjects, None  # Don't pass progressCallback to avoid double logging
+                    )
+
+                    # Update the completed count and log progress
+                    completedCount[0] += 1
+                    if progressCallback:
+                        progressCallback(0.1 + (completedCount[0] / totalParts) * 0.8, f"Completed {partLabel} ({partType}) - {completedCount[0]}/{totalParts}")
+
+                    return partResult
+                except Exception as e:
+                    # Update the completed count even on error
+                    completedCount[0] += 1
+                    logger.error(f"Error processing part {partIndex+1} ({partLabel}): {str(e)}")
+                    if progressCallback:
+                        progressCallback(0.1 + (completedCount[0] / totalParts) * 0.8, f"Error processing {partLabel} ({partType}) - {completedCount[0]}/{totalParts}")
+                    # Return an error response
+                    return self._createErrorResponse(f"Error processing part: {str(e)}", 0, 0)
+
+        # Create tasks for all parts
+        tasks = [processSinglePart(contentPart, i) for i, contentPart in enumerate(contentParts)]
+
+        # Execute all tasks in parallel with error handling
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        # Process results and handle exceptions
         allResults = []
-        for contentPart in contentParts:
-            partResult = await self.processContentPartWithFallback(
-                contentPart, prompt, options, failoverModelList, aiObjects, progressCallback
-            )
-            allResults.append(partResult)
+        for i, result in enumerate(results):
+            if isinstance(result, Exception):
+                logger.error(f"Exception processing part {i+1}: {str(result)}")
+                allResults.append(self._createErrorResponse(f"Exception: {str(result)}", 0, 0))
+            elif result is not None:
+                allResults.append(result)

         # Merge all results using unified mergePartResults
         mergedContent = self.mergePartResults(allResults)
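
With return_exceptions=True, a failed part surfaces as an exception object in the results list rather than cancelling its siblings; a minimal standalone demonstration of that handling:

```python
import asyncio

async def flaky(i: int) -> str:
    if i == 1:
        raise RuntimeError("part failed")
    return f"part {i} ok"

async def main():
    results = await asyncio.gather(*(flaky(i) for i in range(3)), return_exceptions=True)
    for i, result in enumerate(results):
        if isinstance(result, Exception):  # mirrors the error-response branch above
            print(f"part {i + 1} -> error: {result}")
        else:
            print(f"part {i + 1} -> {result}")

asyncio.run(main())
```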
diff --git a/modules/services/serviceGeneration/paths/codePath.py b/modules/services/serviceGeneration/paths/codePath.py
new file mode 100644
index 00000000..5beb1867
--- /dev/null
+++ b/modules/services/serviceGeneration/paths/codePath.py
@@ -0,0 +1,584 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""
+Code Generation Path
+
+Handles code generation with multi-file project support, dependency handling,
+and proper cross-file references.
+"""
+
+import json
+import logging
+import time
+import re
+from typing import Dict, Any, List, Optional
+from modules.datamodels.datamodelWorkflow import AiResponse, AiResponseMetadata, DocumentData
+from modules.datamodels.datamodelExtraction import ContentPart
+from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
+
+logger = logging.getLogger(__name__)
+
+
+class CodeGenerationPath:
+    """Code generation path."""
+
+    def __init__(self, services):
+        self.services = services
+
+    async def generateCode(
+        self,
+        userPrompt: str,
+        outputFormat: str = None,
+        contentParts: Optional[List[ContentPart]] = None,
+        title: str = "Generated Code",
+        parentOperationId: Optional[str] = None
+    ) -> AiResponse:
+        """
+        Generate code files with multi-file project support.
+
+        Returns: AiResponse with code files as documents
+        """
+        # Create operation ID
+        workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
+        codeOperationId = f"code_gen_{workflowId}_{int(time.time())}"
+
+        # Start progress tracking
+        self.services.chat.progressLogStart(
+            codeOperationId,
+            "Code Generation",
+            "Code Generation",
+            f"Format: {outputFormat or 'txt'}",
+            parentOperationId=parentOperationId
+        )
+
+        try:
+            # Detect language and project type from the prompt or outputFormat
+            language, projectType = self._detectLanguageAndProjectType(userPrompt, outputFormat)
+
+            # Phase 1: Code structure generation (with looping)
+            self.services.chat.progressLogUpdate(codeOperationId, 0.2, "Generating code structure")
+            codeStructure = await self._generateCodeStructure(
+                userPrompt=userPrompt,
+                language=language,
+                outputFormat=outputFormat,
+                contentParts=contentParts
+            )
+
+            # Phase 2: Code content generation (with dependency handling)
+            self.services.chat.progressLogUpdate(codeOperationId, 0.5, "Generating code content")
+            codeFiles = await self._generateCodeContent(codeStructure, codeOperationId)
+
+            # Phase 3: Code formatting & validation
+            self.services.chat.progressLogUpdate(codeOperationId, 0.9, "Formatting code files")
+            formattedFiles = await self._formatAndValidateCode(codeFiles)
+
+            # Convert to the unified document format
+            documents = []
+            for file in formattedFiles:
+                mimeType = self._getMimeType(file.get("fileType", outputFormat or "txt"))
+                content = file.get("content", "")
+                if isinstance(content, str):
+                    contentBytes = content.encode('utf-8')
+                else:
+                    contentBytes = content
+
+                documents.append(DocumentData(
+                    documentName=file.get("filename", "generated.txt"),
+                    documentData=contentBytes,
+                    mimeType=mimeType,
+                    sourceJson=file
+                ))
+
+            metadata = AiResponseMetadata(
+                title=title,
+                operationType=OperationTypeEnum.DATA_GENERATE.value
+            )
+
+            self.services.chat.progressLogFinish(codeOperationId, True)
+
+            return AiResponse(
+                documents=documents,
+                content=None,
+                metadata=metadata
+            )
+
+        except Exception as e:
+            logger.error(f"Error in code generation: {str(e)}")
+            self.services.chat.progressLogFinish(codeOperationId, False)
+            raise
+
+    def _detectLanguageAndProjectType(self, userPrompt: str, outputFormat: Optional[str]) -> tuple:
+        """Detect the programming language and project type from the prompt or format."""
+        promptLower = userPrompt.lower()
+
+        # Detect language
+        language = None
+        if outputFormat:
+            if outputFormat == "py":
+                language = "python"
+            elif outputFormat in ["js", "ts"]:
+                language = outputFormat
+            elif outputFormat == "html":
+                language = "html"
+
+        if not language:
+            if "python" in promptLower or ".py" in promptLower:
+                language = "python"
+            elif "javascript" in promptLower or ".js" in promptLower:
+                language = "javascript"
+            elif "typescript" in promptLower or ".ts" in promptLower:
+                language = "typescript"
+            elif "html" in promptLower:
+                language = "html"
+            else:
+                language = "python"  # Default
+
+        # Detect project type
+        projectType = "single_file"
+        if "multi" in promptLower or "multiple files" in promptLower or "project" in promptLower:
+            projectType = "multi_file"
+
+        return language, projectType
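
Expected behaviour of the detection heuristic, shown as a sketch; the helper does not touch `services`, so it can be probed in isolation (expected results in comments):

```python
from modules.services.serviceGeneration.paths.codePath import CodeGenerationPath

path = CodeGenerationPath(services=None)  # services is unused by this helper

print(path._detectLanguageAndProjectType("Build a small Flask API", "py"))
# -> ("python", "single_file")

print(path._detectLanguageAndProjectType("A TypeScript app with multiple files", None))
# -> ("typescript", "multi_file")
```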
+    async def _generateCodeStructure(
+        self,
+        userPrompt: str,
+        language: str,
+        outputFormat: Optional[str],
+        contentParts: Optional[List[ContentPart]]
+    ) -> Dict[str, Any]:
+        """Generate the code structure using the looping system."""
+
+        # Build the structure generation prompt
+        structurePrompt = f"""Analyze the following code generation request and create a project structure.
+
+Request: {userPrompt}
+
+Language: {language}
+
+Create a JSON structure with:
+1. metadata: {{"language": "{language}", "projectType": "single_file|multi_file", "projectName": "..."}}
+2. files: Array of file structures, each with:
+   - id: Unique identifier
+   - filename: File name (e.g., "main.py", "utils.py")
+   - fileType: File extension (e.g., "py", "js")
+   - dependencies: List of file IDs this file depends on (for multi-file projects)
+   - imports: List of import statements (for dependency extraction)
+   - functions: Array of function signatures {{"name": "...", "signature": "..."}}
+   - classes: Array of class definitions {{"name": "...", "signature": "..."}}
+
+For single-file projects, return one file. For multi-file projects, break down into logical modules.
+
+Return ONLY valid JSON in this format:
+{{
+  "metadata": {{
+    "language": "{language}",
+    "projectType": "single_file",
+    "projectName": "generated-project"
+  }},
+  "files": [
+    {{
+      "id": "file_1",
+      "filename": "main.py",
+      "fileType": "py",
+      "dependencies": [],
+      "imports": [],
+      "functions": [],
+      "classes": []
+    }}
+  ]
+}}
+"""
+
+        # Use the generic looping system with the code_structure use case
+        options = AiCallOptions(
+            operationType=OperationTypeEnum.DATA_GENERATE,
+            resultFormat="json"
+        )
+
+        structureJson = await self.services.ai.callAiWithLooping(
+            prompt=structurePrompt,
+            options=options,
+            useCaseId="code_structure",
+            debugPrefix="code_structure_generation",
+            contentParts=contentParts
+        )
+
+        parsed = json.loads(structureJson)
+        return parsed
+
+    async def _generateCodeContent(
+        self,
+        codeStructure: Dict[str, Any],
+        parentOperationId: str
+    ) -> List[Dict[str, Any]]:
+        """Generate code content for each file with dependency handling."""
+        files = codeStructure.get("files", [])
+        metadata = codeStructure.get("metadata", {})
+
+        if not files:
+            raise ValueError("No files found in code structure")
+
+        # Step 1: Resolve the dependency order
+        orderedFiles = self._resolveDependencyOrder(files)
+
+        # Step 2: Generate dependency files first (requirements.txt, package.json, etc.)
+ dependencyFiles = await self._generateDependencyFiles(metadata, orderedFiles) + + # Step 3: Generate code files in dependency order (not fully parallel) + codeFiles = [] + generatedFileContext = {} # Track what's been generated for cross-file references + + for idx, fileStructure in enumerate(orderedFiles): + # Update progress + progress = 0.5 + (0.4 * (idx / len(orderedFiles))) + self.services.chat.progressLogUpdate( + parentOperationId, + progress, + f"Generating {fileStructure.get('filename', 'file')}" + ) + + # Provide context about already-generated files for proper imports + fileContext = self._buildFileContext(generatedFileContext, fileStructure) + + # Generate this file with context + fileContent = await self._generateSingleFileContent( + fileStructure, + fileContext=fileContext, + allFilesStructure=orderedFiles, + metadata=metadata + ) + + codeFiles.append(fileContent) + + # Update context with generated file info (for next files) + generatedFileContext[fileStructure["id"]] = { + "filename": fileContent.get("filename", fileStructure.get("filename")), + "functions": fileContent.get("functions", []), + "classes": fileContent.get("classes", []), + "exports": fileContent.get("exports", []) + } + + # Combine dependency files and code files + return dependencyFiles + codeFiles + + def _resolveDependencyOrder(self, files: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Resolve file generation order based on dependencies using topological sort.""" + # Build dependency graph + fileMap = {f["id"]: f for f in files} + dependencies = {} + + for file in files: + fileId = file["id"] + deps = file.get("dependencies", []) # List of file IDs this file depends on + dependencies[fileId] = deps + + # Topological sort + ordered = [] + visited = set() + tempMark = set() + + def visit(fileId: str): + if fileId in tempMark: + # Circular dependency detected - break it + logger.warning(f"Circular dependency detected involving {fileId}") + return + if fileId in visited: + return + + tempMark.add(fileId) + for depId in dependencies.get(fileId, []): + if depId in fileMap: + visit(depId) + tempMark.remove(fileId) + visited.add(fileId) + ordered.append(fileMap[fileId]) + + for file in files: + if file["id"] not in visited: + visit(file["id"]) + + return ordered + + async def _generateDependencyFiles( + self, + metadata: Dict[str, Any], + files: List[Dict[str, Any]] + ) -> List[Dict[str, Any]]: + """Generate dependency files (requirements.txt, package.json, etc.).""" + language = metadata.get("language", "").lower() + dependencyFiles = [] + + # Generate requirements.txt for Python + if language in ["python", "py"]: + requirementsContent = await self._generateRequirementsTxt(files) + if requirementsContent: + dependencyFiles.append({ + "filename": "requirements.txt", + "content": requirementsContent, + "fileType": "txt", + "id": "requirements_txt" + }) + + # Generate package.json for JavaScript/TypeScript + elif language in ["javascript", "typescript", "js", "ts"]: + packageJson = await self._generatePackageJson(files, metadata) + if packageJson: + dependencyFiles.append({ + "filename": "package.json", + "content": json.dumps(packageJson, indent=2), + "fileType": "json", + "id": "package_json" + }) + + return dependencyFiles + + async def _generateRequirementsTxt( + self, + files: List[Dict[str, Any]] + ) -> Optional[str]: + """Generate requirements.txt content from Python imports.""" + pythonPackages = set() + + for file in files: + imports = file.get("imports", []) + if isinstance(imports, list): + for 
imp in imports: + if isinstance(imp, str): + # Extract package name from import + # Handle: "from flask import", "import flask", "from flask import Flask" + imp = imp.strip() + if "import" in imp: + if "from" in imp: + # "from package import ..." + parts = imp.split("from") + if len(parts) > 1: + package = parts[1].split("import")[0].strip() + if package and not package.startswith("."): + pythonPackages.add(package.split(".")[0]) # Get root package + else: + # "import package" or "import package.module" + parts = imp.split("import") + if len(parts) > 1: + package = parts[1].strip().split(".")[0].strip() + if package and not package.startswith("."): + pythonPackages.add(package) + + if pythonPackages: + return "\n".join(sorted(pythonPackages)) + return None + + async def _generatePackageJson( + self, + files: List[Dict[str, Any]], + metadata: Dict[str, Any] + ) -> Optional[Dict[str, Any]]: + """Generate package.json content from JavaScript/TypeScript imports.""" + npmPackages = {} + + for file in files: + imports = file.get("imports", []) + if isinstance(imports, list): + for imp in imports: + if isinstance(imp, str): + # Extract npm package from import + # Handle: "import express from 'express'", "const express = require('express')" + imp = imp.strip() + if "from" in imp: + # ES6 import: "import ... from 'package'" + parts = imp.split("from") + if len(parts) > 1: + package = parts[1].strip().strip("'\"") + if package and not package.startswith(".") and not package.startswith("/"): + npmPackages[package] = "*" + elif "require" in imp: + # CommonJS: "require('package')" + match = re.search(r"require\(['\"]([^'\"]+)['\"]\)", imp) + if match: + package = match.group(1) + if not package.startswith(".") and not package.startswith("/"): + npmPackages[package] = "*" + + if npmPackages: + return { + "name": metadata.get("projectName", "generated-project"), + "version": "1.0.0", + "dependencies": npmPackages + } + return None + + def _buildFileContext( + self, + generatedFileContext: Dict[str, Dict[str, Any]], + currentFile: Dict[str, Any] + ) -> Dict[str, Any]: + """Build context about other files for proper imports/references.""" + context = { + "availableFiles": [], + "availableFunctions": {}, + "availableClasses": {} + } + + # Add info about already-generated files + for fileId, fileInfo in generatedFileContext.items(): + context["availableFiles"].append({ + "id": fileId, + "filename": fileInfo["filename"], + "functions": fileInfo.get("functions", []), + "classes": fileInfo.get("classes", []), + "exports": fileInfo.get("exports", []) + }) + + # Build function/class maps for easy lookup + for func in fileInfo.get("functions", []): + funcName = func.get("name", "") + if funcName: + context["availableFunctions"][funcName] = { + "file": fileInfo["filename"], + "signature": func.get("signature", "") + } + + for cls in fileInfo.get("classes", []): + className = cls.get("name", "") + if className: + context["availableClasses"][className] = { + "file": fileInfo["filename"] + } + + return context + + async def _generateSingleFileContent( + self, + fileStructure: Dict[str, Any], + fileContext: Dict[str, Any] = None, + allFilesStructure: List[Dict[str, Any]] = None, + metadata: Dict[str, Any] = None + ) -> Dict[str, Any]: + """Generate code content for a single file with context about other files.""" + + # Build prompt with context about other files for proper imports + filename = fileStructure.get("filename", "generated.py") + fileType = fileStructure.get("fileType", "py") + dependencies = 
fileStructure.get("dependencies", []) + functions = fileStructure.get("functions", []) + classes = fileStructure.get("classes", []) + + contextInfo = "" + if fileContext and fileContext.get("availableFiles"): + contextInfo = "\n\nAvailable files and their exports:\n" + for fileInfo in fileContext["availableFiles"]: + contextInfo += f"- {fileInfo['filename']}: " + funcs = [f.get("name", "") for f in fileInfo.get("functions", [])] + cls = [c.get("name", "") for c in fileInfo.get("classes", [])] + exports = [] + if funcs: + exports.extend(funcs) + if cls: + exports.extend(cls) + if exports: + contextInfo += ", ".join(exports) + contextInfo += "\n" + + contentPrompt = f"""Generate complete, executable code for the file: {filename} + +File Type: {fileType} +Language: {metadata.get('language', 'python') if metadata else 'python'} + +Required functions: +{json.dumps(functions, indent=2) if functions else 'None specified'} + +Required classes: +{json.dumps(classes, indent=2) if classes else 'None specified'} + +Dependencies on other files: {', '.join(dependencies) if dependencies else 'None'} +{contextInfo} + +Generate complete, production-ready code with: +1. Proper imports (including imports from other files in the project if dependencies exist) +2. All required functions and classes +3. Error handling +4. Documentation/docstrings +5. Type hints where appropriate + +Return ONLY valid JSON in this format: +{{ + "files": [ + {{ + "filename": "{filename}", + "content": "// Complete code here", + "functions": {json.dumps(functions, indent=2) if functions else '[]'}, + "classes": {json.dumps(classes, indent=2) if classes else '[]'} + }} + ] +}} +""" + + # Use generic looping system with code_content use case + options = AiCallOptions( + operationType=OperationTypeEnum.DATA_GENERATE, + resultFormat="json" + ) + + contentJson = await self.services.ai.callAiWithLooping( + prompt=contentPrompt, + options=options, + useCaseId="code_content", + debugPrefix=f"code_content_{fileStructure.get('id', 'file')}", + ) + + parsed = json.loads(contentJson) + + # Extract file content and metadata + files = parsed.get("files", []) + if files and len(files) > 0: + fileData = files[0] + return { + "filename": fileData.get("filename", filename), + "content": fileData.get("content", ""), + "fileType": fileType, + "functions": fileData.get("functions", functions), + "classes": fileData.get("classes", classes), + "id": fileStructure.get("id") + } + + # Fallback if structure is different + return { + "filename": filename, + "content": parsed.get("content", ""), + "fileType": fileType, + "functions": functions, + "classes": classes, + "id": fileStructure.get("id") + } + + async def _formatAndValidateCode(self, codeFiles: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Format and validate generated code files.""" + # For now, just return files as-is + # TODO: Add code formatting (black, prettier, etc.) 
and validation + formatted = [] + for file in codeFiles: + content = file.get("content", "") + # Basic cleanup: remove markdown code fences if present + if isinstance(content, str): + content = re.sub(r'^```[\w]*\n', '', content, flags=re.MULTILINE) + content = re.sub(r'\n```$', '', content, flags=re.MULTILINE) + file["content"] = content.strip() + formatted.append(file) + return formatted + + def _getMimeType(self, fileType: str) -> str: + """Get MIME type for file type.""" + mimeTypes = { + "py": "text/x-python", + "js": "application/javascript", + "ts": "application/typescript", + "html": "text/html", + "css": "text/css", + "json": "application/json", + "txt": "text/plain", + "md": "text/markdown", + "java": "text/x-java-source", + "cpp": "text/x-c++src", + "c": "text/x-csrc" + } + return mimeTypes.get(fileType.lower(), "text/plain") diff --git a/modules/services/serviceGeneration/paths/documentPath.py b/modules/services/serviceGeneration/paths/documentPath.py new file mode 100644 index 00000000..d03c82a0 --- /dev/null +++ b/modules/services/serviceGeneration/paths/documentPath.py @@ -0,0 +1,258 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +""" +Document Generation Path + +Handles document generation using existing chapter/section model. +""" + +import json +import logging +import time +from typing import Dict, Any, List, Optional +from modules.datamodels.datamodelWorkflow import AiResponse, AiResponseMetadata, DocumentData +from modules.datamodels.datamodelExtraction import ContentPart, DocumentIntent +from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum +from modules.datamodels.datamodelDocument import RenderedDocument + +logger = logging.getLogger(__name__) + + +class DocumentGenerationPath: + """Document generation path (existing functionality, refactored).""" + + def __init__(self, services): + self.services = services + + async def generateDocument( + self, + userPrompt: str, + documentList: Optional[Any] = None, # DocumentReferenceList + documentIntents: Optional[List[DocumentIntent]] = None, + contentParts: Optional[List[ContentPart]] = None, + outputFormat: str = "txt", + title: Optional[str] = None, + parentOperationId: Optional[str] = None + ) -> AiResponse: + """ + Generate document using existing chapter/section model. 
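+
+        Pipeline (steps 5A-5E below): clarify document intents, extract and
+        prepare content, generate structure, fill structure, render result.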
+
+        Returns: AiResponse with documents list
+        """
+        # Create operation ID
+        workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
+        docOperationId = f"doc_gen_{workflowId}_{int(time.time())}"
+
+        # Start progress tracking
+        self.services.chat.progressLogStart(
+            docOperationId,
+            "Document Generation",
+            "Document Generation",
+            f"Format: {outputFormat}",
+            parentOperationId=parentOperationId
+        )
+
+        try:
+            # Step 5A: Clarify document intents
+            documents = []
+            if documentList:
+                documents = self.services.chat.getChatDocumentsFromDocumentList(documentList)
+
+            if not documentIntents and documents:
+                documentIntents = await self.services.ai.clarifyDocumentIntents(
+                    documents,
+                    userPrompt,
+                    {"outputFormat": outputFormat},
+                    docOperationId
+                )
+
+            # Step 5B: Extract and prepare content
+            if documents:
+                preparedContentParts = await self.services.ai.extractAndPrepareContent(
+                    documents,
+                    documentIntents or [],
+                    docOperationId
+                )
+
+                # Merge with provided contentParts (if any)
+                if contentParts:
+                    # Check for pre-extracted content
+                    for part in contentParts:
+                        if part.metadata.get("skipExtraction", False):
+                            # Already extracted - use as-is; make sure the metadata is complete
+                            part.metadata.setdefault("contentFormat", "extracted")
+                            part.metadata.setdefault("isPreExtracted", True)
+                    preparedContentParts.extend(contentParts)
+
+                contentParts = preparedContentParts
+
+            # Step 5B.5: Process contentParts with AI extraction (if provided)
+            # This extracts text from images, processes content, and updates contentParts with extracted data
+            # This matches the original flow: extract content first (no AI), then process with AI
+            if contentParts:
+                # Filter out binary/other parts that shouldn't be processed
+                processableParts = []
+                skippedParts = []
+                for p in contentParts:
+                    if p.typeGroup in ["image", "text", "table", "structure"] or (p.mimeType and (p.mimeType.startswith("image/") or p.mimeType.startswith("text/"))):
+                        processableParts.append(p)
+                    else:
+                        skippedParts.append(p)
+
+                if skippedParts:
+                    logger.debug(f"Skipping {len(skippedParts)} binary/other parts from document generation")
+
+                if processableParts:
+                    # Count images for progress update
+                    imageCount = len([p for p in processableParts if p.typeGroup == "image" or (p.mimeType and p.mimeType.startswith("image/"))])
+                    if imageCount > 0:
+                        self.services.chat.progressLogUpdate(docOperationId, 0.25, f"Extracting data from {imageCount} images using vision models")
+
+                    # Build proper extraction prompt using buildExtractionPrompt
+                    # This creates a focused extraction prompt, not the user's generation prompt
+                    from modules.services.serviceExtraction.subPromptBuilderExtraction import buildExtractionPrompt
+                    from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
+
+                    # Determine renderer for format-specific guidelines
+                    renderer = None
+                    if outputFormat:
+                        try:
+                            from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
+                            generationService = GenerationService(self.services)
+                            renderer = generationService.getRendererForFormat(outputFormat)
+                        except Exception as e:
+                            logger.debug(f"Could not get renderer for format {outputFormat}: {e}")
+
+                    extractionPrompt = await buildExtractionPrompt(
+                        outputFormat=outputFormat or "txt",
+                        userPrompt=userPrompt,  # User's prompt as context for what to extract
+                        title=title or "Document",
+                        aiService=self.services.ai if hasattr(self.services.ai, 'aiObjects') and self.services.ai.aiObjects else None,
+                        services=self.services,
+                        renderer=renderer
+                    )
+
+                    logger.info(f"Processing {len(processableParts)} content parts ({imageCount} images) with extraction prompt")
+
+                    # Update progress - starting extraction
+                    self.services.chat.progressLogUpdate(docOperationId, 0.26, f"Starting AI extraction from {len(processableParts)} content parts")
+
+                    # Use DATA_EXTRACT operation type for extraction
+                    extractionOptions = AiCallOptions(
+                        operationType=OperationTypeEnum.DATA_EXTRACT,  # Use DATA_EXTRACT for extraction
+                        compressPrompt=False,
+                        compressContext=False
+                    )
+
+                    # Create progress callback for per-part progress updates
+                    def extractionProgressCallback(progress: float, message: str):
+                        """Progress callback for extraction - updates parent operation."""
+                        # Map progress from 0.0-1.0 to 0.26-0.35 range (extraction phase)
+                        mappedProgress = 0.26 + (progress * 0.09)  # 0.26 to 0.35
+                        self.services.chat.progressLogUpdate(docOperationId, mappedProgress, message)
+
+                    extractionRequest = AiCallRequest(
+                        prompt=extractionPrompt,  # Use proper extraction prompt, not user's generation prompt
+                        context="",
+                        options=extractionOptions,
+                        contentParts=processableParts
+                    )
+
+                    # Write debug file for extraction prompt (all parts)
+                    self.services.utils.writeDebugFile(extractionPrompt, "content_extraction_prompt")
+
+                    # Call AI to extract content from contentParts (with progress callback)
+                    extractionResponse = await self.services.ai.callAi(extractionRequest, progressCallback=extractionProgressCallback)
+
+                    # Update progress - extraction completed
+                    self.services.chat.progressLogUpdate(docOperationId, 0.35, f"Completed AI extraction from {len(processableParts)} content parts")
+
+                    # Write debug file for extraction response
+                    if extractionResponse.content:
+                        self.services.utils.writeDebugFile(extractionResponse.content, "content_extraction_response")
+                    else:
+                        self.services.utils.writeDebugFile(f"Error: No content returned (errorCount={extractionResponse.errorCount})", "content_extraction_response")
+                        logger.warning(f"Content extraction returned no content (errorCount={extractionResponse.errorCount})")
+
+                    # Update contentParts with extracted content (matching original flow)
+                    if extractionResponse.errorCount == 0 and extractionResponse.content:
+                        # The extracted content is already merged - update the first processable part with it
+                        # This matches the original behavior where extracted text was used for generation
+                        if processableParts:
+                            # Store extracted content in metadata for use in structure generation
+                            processableParts[0].metadata["extractedContent"] = extractionResponse.content
+                            logger.info(f"Successfully extracted content from {len(processableParts)} parts ({len(extractionResponse.content)} chars)")
+                    else:
+                        # Extraction failed - log a warning but continue
+                        logger.warning("Content extraction failed, continuing with original contentParts")
+
+            # Step 5C: Generate structure
+            structure = await self.services.ai.generateStructure(
+                userPrompt,
+                contentParts or [],
+                outputFormat,
+                docOperationId
+            )
+
+            # Step 5D: Fill structure
+            # Language will be extracted from services (user intention analysis) in fillStructure
+            filledStructure = await self.services.ai.fillStructure(
+                structure,
+                contentParts or [],
+                userPrompt,
+                docOperationId
+            )
+
+            # Step 5E: Render result
+            # Each document is rendered individually and may return 1..n files (e.g., HTML + images)
+            renderedDocuments = await self.services.ai.renderResult(
+                filledStructure,
+                outputFormat,
+                title or "Generated Document",
+                userPrompt,
+                docOperationId
+            )
+
+            # Build the response: convert all rendered documents to DocumentData
+            documentDataList = []
+            for renderedDoc in renderedDocuments:
+                try:
+                    # Create DocumentData for each rendered document
+                    docDataObj = DocumentData(
+                        documentName=renderedDoc.filename,
+                        documentData=renderedDoc.documentData,
+                        mimeType=renderedDoc.mimeType,
+                        sourceJson=filledStructure if len(documentDataList) == 0 else None  # Only for the first document
+                    )
+                    documentDataList.append(docDataObj)
+                    logger.debug(f"Added rendered document: {renderedDoc.filename} ({len(renderedDoc.documentData)} bytes, {renderedDoc.mimeType})")
+                except Exception as e:
+                    logger.warning(f"Error creating document {renderedDoc.filename}: {str(e)}")
+
+            if not documentDataList:
+                raise ValueError("No documents were rendered")
+
+            metadata = AiResponseMetadata(
+                title=title or filledStructure.get("metadata", {}).get("title", "Generated Document"),
+                operationType=OperationTypeEnum.DATA_GENERATE.value
+            )
+
+            # Debug log (harmonized)
+            self.services.utils.writeDebugFile(
+                json.dumps(filledStructure, indent=2, ensure_ascii=False, default=str),
+                "document_generation_response"
+            )
+
+            self.services.chat.progressLogFinish(docOperationId, True)
+
+            return AiResponse(
+                content=json.dumps(filledStructure),
+                metadata=metadata,
+                documents=documentDataList
+            )
+
+        except Exception as e:
+            logger.error(f"Error in document generation: {str(e)}")
+            self.services.chat.progressLogFinish(docOperationId, False)
+            raise
+
diff --git a/modules/services/serviceGeneration/paths/imagePath.py b/modules/services/serviceGeneration/paths/imagePath.py
new file mode 100644
index 00000000..1247494f
--- /dev/null
+++ b/modules/services/serviceGeneration/paths/imagePath.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""
+Image Generation Path
+
+Handles image generation with support for single and batch generation.
+"""
+
+import logging
+import time
+from typing import List, Optional
+from modules.datamodels.datamodelWorkflow import AiResponse, AiResponseMetadata, DocumentData
+from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, AiCallRequest
+
+logger = logging.getLogger(__name__)
+
+
+class ImageGenerationPath:
+    """Image generation path."""
+
+    def __init__(self, services):
+        self.services = services
+
+    async def generateImages(
+        self,
+        userPrompt: str,
+        count: int = 1,
+        style: Optional[str] = None,
+        format: str = "png",
+        title: Optional[str] = None,
+        parentOperationId: Optional[str] = None
+    ) -> AiResponse:
+        """
+        Generate image files.
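+
+        Note: count is reserved for batch generation; the current implementation
+        generates a single image per call.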
+
+        Returns: AiResponse with image files as documents
+        """
+        # Create operation ID
+        workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
+        imageOperationId = f"image_gen_{workflowId}_{int(time.time())}"
+
+        # Start progress tracking
+        self.services.chat.progressLogStart(
+            imageOperationId,
+            "Image Generation",
+            "Image Generation",
+            f"Format: {format}",
+            parentOperationId=parentOperationId
+        )
+
+        try:
+            self.services.chat.progressLogUpdate(imageOperationId, 0.4, "Calling AI for image generation")
+
+            # Build prompt with style if provided
+            imagePrompt = userPrompt
+            if style:
+                imagePrompt = f"{userPrompt}\n\nStyle: {style}"
+
+            # Use IMAGE_GENERATE operation
+            options = AiCallOptions(
+                operationType=OperationTypeEnum.IMAGE_GENERATE,
+                resultFormat=format
+            )
+
+            request = AiCallRequest(
+                prompt=imagePrompt,
+                context="",
+                options=options
+            )
+
+            response = await self.services.ai.callAi(request)
+
+            if not response.content:
+                errorMsg = "No image data returned from AI"
+                logger.error(f"Error in AI image generation: {errorMsg}")
+                self.services.chat.progressLogFinish(imageOperationId, False)
+                raise ValueError(errorMsg)
+
+            # Handle response content (could be base64 string or bytes)
+            imageData = response.content
+            if isinstance(imageData, str):
+                # Assume base64 encoded string
+                import base64
+                try:
+                    imageData = base64.b64decode(imageData)
+                except Exception:
+                    # If not base64, fall back to raw bytes
+                    imageData = imageData.encode('utf-8')
+            elif not isinstance(imageData, bytes):
+                imageData = bytes(imageData)
+
+            # Create document (normalize "jpg" to the standard "image/jpeg" MIME type)
+            imageMime = "image/jpeg" if format.lower() in ("jpg", "jpeg") else f"image/{format}"
+            imageDoc = DocumentData(
+                documentName=f"generated_image.{format}",
+                documentData=imageData,
+                mimeType=imageMime
+            )
+
+            metadata = AiResponseMetadata(
+                title=title or "Generated Image",
+                operationType=OperationTypeEnum.IMAGE_GENERATE.value
+            )
+
+            self.services.chat.storeWorkflowStat(
+                self.services.workflow,
+                response,
+                "ai.generate.image"
+            )
+
+            self.services.chat.progressLogUpdate(imageOperationId, 0.9, "Image generated")
+            self.services.chat.progressLogFinish(imageOperationId, True)
+
+            # Create content string describing the image generation
+            import json
+            contentJson = json.dumps({
+                "type": "image",
+                "format": format,
+                "prompt": userPrompt,
+                "filename": imageDoc.documentName
+            }, ensure_ascii=False)
+
+            return AiResponse(
+                content=contentJson,  # JSON string describing the image generation
+                metadata=metadata,
+                documents=[imageDoc]
+            )
+
+        except Exception as e:
+            logger.error(f"Error in image generation: {str(e)}")
+            self.services.chat.progressLogFinish(imageOperationId, False)
+            raise
+
diff --git a/modules/workflows/methods/methodAi.py.old b/modules/workflows/methods/methodAi.py.old
deleted file mode 100644
index fedaa0ef..00000000
--- a/modules/workflows/methods/methodAi.py.old
+++ /dev/null
@@ -1,742 +0,0 @@
-# Copyright (c) 2025 Patrick Motsch
-# All rights reserved.
-"""
-AI processing method module.
-Handles direct AI calls for any type of task.
-""" - -import time -import logging -from typing import Dict, Any, List, Optional -from datetime import datetime, UTC - -from modules.workflows.methods.methodBase import MethodBase, action -from modules.datamodels.datamodelChat import ActionResult, ActionDocument -from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum -from modules.datamodels.datamodelWorkflow import ExtractContentParameters -from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy, ContentPart - -logger = logging.getLogger(__name__) - -class MethodAi(MethodBase): - """AI processing methods.""" - - def __init__(self, services): - super().__init__(services) - self.name = "ai" - self.description = "AI processing methods" - - def _format_timestamp_for_filename(self) -> str: - """Format current timestamp as YYYYMMDD-hhmmss for filenames.""" - return datetime.now(UTC).strftime("%Y%m%d-%H%M%S") - - - @action - async def process(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Universal AI document processing action - accepts MULTIPLE input documents in ANY format (docx, pdf, json, txt, xlsx, html, images, etc.) and processes them together with a prompt to produce MULTIPLE output documents in ANY specified format (via resultType). Use for document generation, format conversion, content transformation, analysis, summarization, translation, extraction, comparison, and any AI-powered document manipulation. - - Input requirements: aiPrompt (required); optional documentList (can contain multiple documents in any format). - - Output format: Multiple documents in the same format per call (via resultType: txt, json, pdf, docx, xlsx, pptx, png, jpg, etc.). The AI can generate multiple files based on the prompt (e.g., "create separate documents for each section"). Default: txt. - - Key capabilities: Can process any number of input documents together, extract data from mixed formats, combine information, generate multiple output files, transform between formats, perform analysis/comparison/summarization on document sets. - - Parameters: - - aiPrompt (str, required): Instruction for the AI describing what processing to perform. - - documentList (list, optional): Document reference(s) in any format to use as input/context. - - resultType (str, optional): Output file extension (txt, json, md, csv, xml, html, pdf, docx, xlsx, png, etc.). All output documents will use this format. Default: txt. 
- """ - try: - # Init progress logger - workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}" - operationId = f"ai_process_{workflowId}_{int(time.time())}" - - # Start progress tracking - parentOperationId = parameters.get('parentOperationId') - self.services.chat.progressLogStart( - operationId, - "Generate", - "AI Processing", - f"Format: {parameters.get('resultType', 'txt')}", - parentOperationId=parentOperationId - ) - - aiPrompt = parameters.get("aiPrompt") - logger.info(f"aiPrompt extracted: '{aiPrompt}' (type: {type(aiPrompt)})") - - # Update progress - preparing parameters - self.services.chat.progressLogUpdate(operationId, 0.2, "Preparing parameters") - - from modules.datamodels.datamodelDocref import DocumentReferenceList - - documentListParam = parameters.get("documentList") - # Convert to DocumentReferenceList if needed - if documentListParam is None: - documentList = DocumentReferenceList(references=[]) - elif isinstance(documentListParam, DocumentReferenceList): - documentList = documentListParam - elif isinstance(documentListParam, str): - documentList = DocumentReferenceList.from_string_list([documentListParam]) - elif isinstance(documentListParam, list): - documentList = DocumentReferenceList.from_string_list(documentListParam) - else: - logger.error(f"Invalid documentList type: {type(documentListParam)}") - documentList = DocumentReferenceList(references=[]) - - resultType = parameters.get("resultType", "txt") - - - if not aiPrompt: - logger.error(f"aiPrompt is missing or empty. Parameters: {parameters}") - return ActionResult.isFailure( - error="AI prompt is required" - ) - - # Determine output extension and default MIME type without duplicating service logic - normalized_result_type = (str(resultType).strip().lstrip('.').lower() or "txt") - output_extension = f".{normalized_result_type}" - output_mime_type = "application/octet-stream" # Prefer service-provided mimeType when available - logger.info(f"Using result type: {resultType} -> {output_extension}") - - # Phase 7.3: Extract content first if documents provided, then use contentParts - # Check if contentParts are already provided (preferred path) - contentParts: Optional[List[ContentPart]] = None - if "contentParts" in parameters: - contentParts = parameters.get("contentParts") - if contentParts and not isinstance(contentParts, list): - # Try to extract from ContentExtracted if it's an ActionDocument - if hasattr(contentParts, 'parts'): - contentParts = contentParts.parts - else: - logger.warning(f"Invalid contentParts type: {type(contentParts)}, treating as empty") - contentParts = None - - # If contentParts not provided but documentList is, extract content first - if not contentParts and documentList.references: - self.services.chat.progressLogUpdate(operationId, 0.3, "Extracting content from documents") - - # Get ChatDocuments - chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(documentList) - if not chatDocuments: - logger.warning("No documents found in documentList") - else: - logger.info(f"Extracting content from {len(chatDocuments)} documents") - - # Prepare extraction options (use defaults if not provided) - extractionOptions = parameters.get("extractionOptions") - if not extractionOptions: - extractionOptions = ExtractionOptions( - prompt="Extract all content from the document", - mergeStrategy=MergeStrategy( - mergeType="concatenate", - groupBy="typeGroup", - orderBy="id" - ), - processDocumentsIndividually=True - ) - - # Extract content 
using extraction service with hierarchical progress logging - # Pass operationId for per-document progress tracking - extractedResults = self.services.extraction.extractContent(chatDocuments, extractionOptions, operationId=operationId) - - # Combine all ContentParts from all extracted results - contentParts = [] - for extracted in extractedResults: - if extracted.parts: - contentParts.extend(extracted.parts) - - logger.info(f"Extracted {len(contentParts)} content parts from {len(extractedResults)} documents") - - # Update progress - preparing AI call - self.services.chat.progressLogUpdate(operationId, 0.4, "Preparing AI call") - - # Build options with only resultFormat - let service layer handle all other parameters - output_format = output_extension.replace('.', '') or 'txt' - options = AiCallOptions( - resultFormat=output_format - # Removed all model parameters - service layer will analyze prompt and determine optimal parameters - ) - - # Update progress - calling AI - self.services.chat.progressLogUpdate(operationId, 0.6, "Calling AI") - - # Use unified callAiContent method with contentParts (extraction is now separate) - aiResponse = await self.services.ai.callAiContent( - prompt=aiPrompt, - options=options, - contentParts=contentParts, # Already extracted (or None if no documents) - outputFormat=output_format, - parentOperationId=operationId - ) - - # Update progress - processing result - self.services.chat.progressLogUpdate(operationId, 0.8, "Processing result") - - from modules.datamodels.datamodelChat import ActionDocument - - # Extract documents from AiResponse - if aiResponse.documents and len(aiResponse.documents) > 0: - action_documents = [] - for doc in aiResponse.documents: - validationMetadata = { - "actionType": "ai.process", - "resultType": normalized_result_type, - "outputFormat": output_format, - "hasDocuments": True, - "documentCount": len(aiResponse.documents) - } - action_documents.append(ActionDocument( - documentName=doc.documentName, - documentData=doc.documentData, - mimeType=doc.mimeType or output_mime_type, - sourceJson=getattr(doc, 'sourceJson', None), # Preserve source JSON for structure validation - validationMetadata=validationMetadata - )) - - final_documents = action_documents - else: - # Text response - create document from content - extension = output_extension.lstrip('.') - meaningful_name = self._generateMeaningfulFileName( - base_name="ai", - extension=extension, - action_name="result" - ) - validationMetadata = { - "actionType": "ai.process", - "resultType": normalized_result_type, - "outputFormat": output_format, - "hasDocuments": False, - "contentType": "text" - } - action_document = ActionDocument( - documentName=meaningful_name, - documentData=aiResponse.content, - mimeType=output_mime_type, - validationMetadata=validationMetadata - ) - final_documents = [action_document] - - # Complete progress tracking - self.services.chat.progressLogFinish(operationId, True) - - return ActionResult.isSuccess(documents=final_documents) - - except Exception as e: - logger.error(f"Error in AI processing: {str(e)}") - - # Complete progress tracking with failure - try: - self.services.chat.progressLogFinish(operationId, False) - except: - pass # Don't fail on progress logging errors - - return ActionResult.isFailure( - error=str(e) - ) - - - @action - async def webResearch(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Web research with two-step process: search for URLs, then crawl content. 
- - Input requirements: prompt (required); optional list(url), country, language, researchDepth. - - Output format: JSON with research results including URLs and content. - - Parameters: - - prompt (str, required): Natural language research instruction. - - urlList (list, optional): Specific URLs to crawl, if needed. - - country (str, optional): Two-digit country code (lowercase, e.g., ch, us, de). - - language (str, optional): Language code (lowercase, e.g., de, en, fr). - - researchDepth (str, optional): Research depth - fast, general, or deep. Default: general. - """ - try: - prompt = parameters.get("prompt") - if not prompt: - return ActionResult.isFailure(error="Research prompt is required") - - # Init progress logger - workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}" - operationId = f"web_research_{workflowId}_{int(time.time())}" - - # Start progress tracking - parentOperationId = parameters.get('parentOperationId') - self.services.chat.progressLogStart( - operationId, - "Web Research", - "Searching and Crawling", - "Extracting URLs and Content", - parentOperationId=parentOperationId - ) - - # Call webcrawl service - service handles all AI intention analysis and processing - result = await self.services.web.performWebResearch( - prompt=prompt, - urls=parameters.get("urlList", []), - country=parameters.get("country"), - language=parameters.get("language"), - researchDepth=parameters.get("researchDepth", "general"), - operationId=operationId - ) - - # Complete progress tracking - self.services.chat.progressLogFinish(operationId, True) - - # Get meaningful filename from research result (generated by intent analyzer) - suggestedFilename = result.get("suggested_filename") - if suggestedFilename: - # Clean and validate filename - import re - cleaned = suggestedFilename.strip().strip('"\'') - cleaned = cleaned.replace('\n', ' ').replace('\r', ' ').strip() - # Ensure it doesn't already have extension - if cleaned.lower().endswith('.json'): - cleaned = cleaned[:-5] - # Validate: should be reasonable length and contain only safe characters - if cleaned and len(cleaned) <= 60 and re.match(r'^[a-zA-Z0-9_\-]+$', cleaned): - meaningfulName = f"{cleaned}.json" - else: - # Fallback to generic meaningful filename - meaningfulName = self._generateMeaningfulFileName( - base_name="web_research", - extension="json", - action_name="research" - ) - else: - # Fallback to generic meaningful filename - meaningfulName = self._generateMeaningfulFileName( - base_name="web_research", - extension="json", - action_name="research" - ) - - from modules.datamodels.datamodelChat import ActionDocument - validationMetadata = { - "actionType": "ai.webResearch", - "prompt": prompt, - "urlList": parameters.get("urlList", []), - "country": parameters.get("country"), - "language": parameters.get("language"), - "researchDepth": parameters.get("researchDepth", "general"), - "resultFormat": "json" - } - actionDocument = ActionDocument( - documentName=meaningfulName, - documentData=result, - mimeType="application/json", - validationMetadata=validationMetadata - ) - - return ActionResult.isSuccess(documents=[actionDocument]) - - except Exception as e: - logger.error(f"Error in web research: {str(e)}") - try: - self.services.chat.progressLogFinish(operationId, False) - except: - pass - return ActionResult.isFailure(error=str(e)) - - - # ============================================================================ - # Document Transformation Wrappers - # 
============================================================================ - - @action - async def summarizeDocument(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Summarize one or more documents, extracting key points and main ideas. - - Input requirements: documentList (required); optional summaryLength, focus. - - Output format: Text document with summary (default: txt, can be overridden with resultType). - - Parameters: - - documentList (list, required): Document reference(s) to summarize. - - summaryLength (str, optional): Desired summary length - brief, medium, or detailed. Default: medium. - - focus (str, optional): Specific aspect to focus on in the summary (e.g., "financial data", "key decisions"). - - resultType (str, optional): Output file extension (txt, md, docx, etc.). Default: txt. - """ - documentList = parameters.get("documentList", []) - if not documentList: - return ActionResult.isFailure(error="documentList is required") - - summaryLength = parameters.get("summaryLength", "medium") - focus = parameters.get("focus") - resultType = parameters.get("resultType", "txt") - - lengthInstructions = { - "brief": "Create a brief summary (2-3 paragraphs)", - "medium": "Create a medium-length summary (comprehensive but concise)", - "detailed": "Create a detailed summary covering all major points" - } - lengthInstruction = lengthInstructions.get(summaryLength.lower(), lengthInstructions["medium"]) - - aiPrompt = f"Summarize the provided document(s). {lengthInstruction}." - if focus: - aiPrompt += f" Focus specifically on: {focus}." - aiPrompt += " Extract and present the key points, main ideas, and important information in a clear, well-structured format." - - return await self.process({ - "aiPrompt": aiPrompt, - "documentList": documentList, - "resultType": resultType - }) - - - @action - async def translateDocument(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Translate documents to a target language while preserving formatting and structure. - - Input requirements: documentList (required); targetLanguage (required). - - Output format: Translated document in same format as input (default) or specified resultType. - - Parameters: - - documentList (list, required): Document reference(s) to translate. - - targetLanguage (str, required): Target language code or name (e.g., "de", "German", "French", "es"). - - sourceLanguage (str, optional): Source language if known (e.g., "en", "English"). If not provided, AI will detect. - - preserveFormatting (bool, optional): Whether to preserve original formatting. Default: True. - - resultType (str, optional): Output file extension. If not specified, uses same format as input. - """ - documentList = parameters.get("documentList", []) - if not documentList: - return ActionResult.isFailure(error="documentList is required") - - targetLanguage = parameters.get("targetLanguage") - if not targetLanguage: - return ActionResult.isFailure(error="targetLanguage is required") - - sourceLanguage = parameters.get("sourceLanguage") - preserveFormatting = parameters.get("preserveFormatting", True) - resultType = parameters.get("resultType") - - aiPrompt = f"Translate the provided document(s) to {targetLanguage}." - if sourceLanguage: - aiPrompt += f" The source language is {sourceLanguage}." - if preserveFormatting: - aiPrompt += " Preserve all formatting, structure, tables, and layout exactly as they appear in the original document." 
- else: - aiPrompt += " Focus on accurate translation of content." - aiPrompt += " Maintain the same document structure, headings, and organization." - - processParams = { - "aiPrompt": aiPrompt, - "documentList": documentList - } - if resultType: - processParams["resultType"] = resultType - - return await self.process(processParams) - - - @action - async def convert(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Convert documents/data between different formats with specific formatting options (e.g., JSON→CSV with custom columns, delimiters). - - Input requirements: documentList (required); inputFormat and outputFormat (required). - - Output format: Document in target format with specified formatting options. - - CRITICAL: If input is already in standardized JSON format, uses automatic rendering system (no AI call needed). - - Parameters: - - documentList (list, required): Document reference(s) to convert. - - inputFormat (str, required): Source format (json, csv, xlsx, txt, etc.). - - outputFormat (str, required): Target format (csv, json, xlsx, txt, etc.). - - columnsPerRow (int, optional): For CSV output, number of columns per row. Default: auto-detect. - - delimiter (str, optional): For CSV output, delimiter character. Default: comma (,). - - includeHeader (bool, optional): For CSV output, whether to include header row. Default: True. - - language (str, optional): Language for output (e.g., 'de', 'en', 'fr'). Default: 'en'. - """ - documentList = parameters.get("documentList", []) - if not documentList: - return ActionResult.isFailure(error="documentList is required") - - inputFormat = parameters.get("inputFormat") - outputFormat = parameters.get("outputFormat") - if not inputFormat or not outputFormat: - return ActionResult.isFailure(error="inputFormat and outputFormat are required") - - # Normalize formats (remove leading dot if present) - normalizedInputFormat = inputFormat.strip().lstrip('.').lower() - normalizedOutputFormat = outputFormat.strip().lstrip('.').lower() - - # Get documents - from modules.datamodels.datamodelDocref import DocumentReferenceList - if isinstance(documentList, DocumentReferenceList): - docRefList = documentList - elif isinstance(documentList, list): - docRefList = DocumentReferenceList.from_string_list(documentList) - else: - docRefList = DocumentReferenceList.from_string_list([documentList]) - - chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(docRefList) - if not chatDocuments: - return ActionResult.isFailure(error="No documents found in documentList") - - # Check if input is standardized JSON format - if so, use direct rendering - if normalizedInputFormat == "json" and len(chatDocuments) == 1: - try: - import json - doc = chatDocuments[0] - # ChatDocument doesn't have documentData - need to load file content using fileId - docBytes = self.services.chat.getFileData(doc.fileId) - if not docBytes: - raise ValueError(f"No file data found for fileId={doc.fileId}") - - # Decode bytes to string - docData = docBytes.decode('utf-8') - - # Try to parse as JSON - if isinstance(docData, str): - jsonData = json.loads(docData) - elif isinstance(docData, dict): - jsonData = docData - else: - jsonData = None - - # Check if it's standardized JSON format (has "documents" or "sections") - if jsonData and (isinstance(jsonData, dict) and ("documents" in jsonData or "sections" in jsonData)): - # Use direct rendering - no AI call needed! 
- from modules.services.serviceGeneration.mainServiceGeneration import GenerationService - generationService = GenerationService(self.services) - - # Ensure format is "documents" array - if "documents" not in jsonData: - jsonData = {"documents": [{"sections": jsonData.get("sections", []), "metadata": jsonData.get("metadata", {})}]} - - # Get title - title = jsonData.get("metadata", {}).get("title", doc.documentName or "Converted Document") - - # Render with options - renderOptions = {} - if normalizedOutputFormat == "csv": - renderOptions["delimiter"] = parameters.get("delimiter", ",") - renderOptions["columnsPerRow"] = parameters.get("columnsPerRow") - renderOptions["includeHeader"] = parameters.get("includeHeader", True) - - rendered_content, mime_type = await generationService.renderReport( - jsonData, normalizedOutputFormat, title, None, None - ) - - # Apply CSV options if needed (renderer will handle them) - if normalizedOutputFormat == "csv" and renderOptions: - rendered_content = self._applyCsvOptions(rendered_content, renderOptions) - - from modules.datamodels.datamodelChat import ActionDocument - validationMetadata = { - "actionType": "ai.convert", - "inputFormat": normalizedInputFormat, - "outputFormat": normalizedOutputFormat, - "hasSourceJson": True, - "conversionType": "direct_rendering" - } - actionDoc = ActionDocument( - documentName=f"{doc.documentName.rsplit('.', 1)[0] if '.' in doc.documentName else doc.documentName}.{normalizedOutputFormat}", - documentData=rendered_content, - mimeType=mime_type, - sourceJson=jsonData, # Preserve source JSON for structure validation - validationMetadata=validationMetadata - ) - - return ActionResult.isSuccess(documents=[actionDoc]) - - except Exception as e: - logger.warning(f"Direct rendering failed, falling back to AI conversion: {str(e)}") - # Fall through to AI-based conversion - - # Fallback: Use AI for conversion (for non-JSON inputs or complex conversions) - columnsPerRow = parameters.get("columnsPerRow") - delimiter = parameters.get("delimiter", ",") - includeHeader = parameters.get("includeHeader", True) - language = parameters.get("language", "en") - - aiPrompt = f"Convert the provided document(s) from {normalizedInputFormat.upper()} format to {normalizedOutputFormat.upper()} format." - - if normalizedOutputFormat == "csv": - aiPrompt += f" Use '{delimiter}' as the delimiter character." - if columnsPerRow: - aiPrompt += f" Format the output with {columnsPerRow} columns per row." - if not includeHeader: - aiPrompt += " Do not include a header row." - else: - aiPrompt += " Include a header row with column names." - - if language and language != "en": - aiPrompt += f" Use language: {language}." - - aiPrompt += " Preserve all data and ensure accurate conversion. Maintain data integrity and structure." 
- - return await self.process({ - "aiPrompt": aiPrompt, - "documentList": documentList, - "resultType": normalizedOutputFormat - }) - - def _applyCsvOptions(self, csvContent: str, options: Dict[str, Any]) -> str: - """Apply CSV formatting options to rendered CSV content.""" - delimiter = options.get("delimiter", ",") - columnsPerRow = options.get("columnsPerRow") - includeHeader = options.get("includeHeader", True) - - # Check if any options need to be applied - needsProcessing = (delimiter != ",") or (columnsPerRow is not None) or (not includeHeader) - - if not needsProcessing: - return csvContent - - import csv - import io - # Re-read CSV with comma, write with new delimiter - reader = csv.reader(io.StringIO(csvContent)) - output = io.StringIO() - writer = csv.writer(output, delimiter=delimiter) - - rows = list(reader) - - # Handle header - if not includeHeader and rows: - rows = rows[1:] # Skip header - - # Handle columnsPerRow - if columnsPerRow: - newRows = [] - for row in rows: - # Split row into chunks of columnsPerRow - for i in range(0, len(row), columnsPerRow): - chunk = row[i:i+columnsPerRow] - # Pad to columnsPerRow if needed - while len(chunk) < columnsPerRow: - chunk.append("") - newRows.append(chunk) - rows = newRows - - for row in rows: - writer.writerow(row) - - return output.getvalue() - - - @action - async def convertDocument(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Convert documents between different formats (PDF→Word, Excel→CSV, etc.). - - Input requirements: documentList (required); targetFormat (required). - - Output format: Document in target format. - - Parameters: - - documentList (list, required): Document reference(s) to convert. - - targetFormat (str, required): Target format extension (docx, pdf, xlsx, csv, txt, html, json, md, etc.). - - preserveStructure (bool, optional): Whether to preserve document structure (headings, tables, etc.). Default: True. - """ - documentList = parameters.get("documentList", []) - if not documentList: - return ActionResult.isFailure(error="documentList is required") - - targetFormat = parameters.get("targetFormat") - if not targetFormat: - return ActionResult.isFailure(error="targetFormat is required") - - preserveStructure = parameters.get("preserveStructure", True) - - # Normalize format (remove leading dot if present) - normalizedFormat = targetFormat.strip().lstrip('.').lower() - - aiPrompt = f"Convert the provided document(s) to {normalizedFormat.upper()} format." - if preserveStructure: - aiPrompt += " Preserve all document structure including headings, tables, formatting, lists, and layout." - aiPrompt += " Ensure the converted document maintains the same content and information as the original." - - return await self.process({ - "aiPrompt": aiPrompt, - "documentList": documentList, - "resultType": normalizedFormat - }) - - - @action - async def extractData(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Extract structured data from documents (key-value pairs, entities, facts, etc.). - - Input requirements: documentList (required); optional dataStructure, fields. - - Output format: JSON by default, or specified resultType. - - Parameters: - - documentList (list, required): Document reference(s) to extract data from. - - dataStructure (str, optional): Desired data structure - flat, nested, or list. Default: nested. - - fields (list, optional): Specific fields/properties to extract (e.g., ["name", "date", "amount"]). 
- - resultType (str, optional): Output format (json, csv, xlsx, etc.). Default: json. - """ - documentList = parameters.get("documentList", []) - if not documentList: - return ActionResult.isFailure(error="documentList is required") - - dataStructure = parameters.get("dataStructure", "nested") - fields = parameters.get("fields", []) - resultType = parameters.get("resultType", "json") - - aiPrompt = "Extract structured data from the provided document(s)." - if fields: - fieldsStr = ", ".join(fields) - aiPrompt += f" Extract the following specific fields: {fieldsStr}." - else: - aiPrompt += " Extract all relevant data including names, dates, amounts, entities, and key information." - - structureInstructions = { - "flat": "Use a flat key-value structure with simple properties.", - "nested": "Use a nested JSON structure with logical grouping of related data.", - "list": "Structure the data as a list/array of objects, one per entity or record." - } - aiPrompt += f" {structureInstructions.get(dataStructure.lower(), structureInstructions['nested'])}" - - aiPrompt += " Ensure all extracted data is accurate and complete." - - return await self.process({ - "aiPrompt": aiPrompt, - "documentList": documentList, - "resultType": resultType - }) - - - # ============================================================================ - # Content Generation Wrapper - # ============================================================================ - - @action - async def generateDocument(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Generate documents from scratch or based on templates/inputs. - - Input requirements: prompt or description (required); optional documentList (for templates/references). - - Output format: Document in specified format (default: docx). - - Parameters: - - prompt (str, required): Description of the document to generate. - - documentList (list, optional): Template documents or reference documents to use as a guide. - - documentType (str, optional): Type of document - letter, memo, proposal, contract, etc. - - resultType (str, optional): Output format (docx, pdf, txt, md, etc.). Default: docx. - """ - prompt = parameters.get("prompt") - if not prompt: - return ActionResult.isFailure(error="prompt is required") - - documentList = parameters.get("documentList", []) - documentType = parameters.get("documentType") - resultType = parameters.get("resultType", "docx") - - aiPrompt = f"Generate a document based on the following requirements: {prompt}" - if documentType: - aiPrompt += f" Document type: {documentType}." - if documentList: - aiPrompt += " Use the provided template/reference documents as a guide for structure, format, and style." - aiPrompt += " Create a professional, well-structured document with appropriate formatting and organization." 
- - processParams = { - "aiPrompt": aiPrompt, - "resultType": resultType - } - if documentList: - processParams["documentList"] = documentList - - return await self.process(processParams) diff --git a/modules/workflows/methods/methodAi/actions/__init__.py b/modules/workflows/methods/methodAi/actions/__init__.py index 8ebe6679..f0f18286 100644 --- a/modules/workflows/methods/methodAi/actions/__init__.py +++ b/modules/workflows/methods/methodAi/actions/__init__.py @@ -10,6 +10,7 @@ from .summarizeDocument import summarizeDocument from .translateDocument import translateDocument from .convertDocument import convertDocument from .generateDocument import generateDocument +from .generateCode import generateCode __all__ = [ 'process', @@ -18,5 +19,6 @@ __all__ = [ 'translateDocument', 'convertDocument', 'generateDocument', + 'generateCode', ] diff --git a/modules/workflows/methods/methodAi/actions/convertDocument.py b/modules/workflows/methods/methodAi/actions/convertDocument.py index e86b1d5a..9a7522ba 100644 --- a/modules/workflows/methods/methodAi/actions/convertDocument.py +++ b/modules/workflows/methods/methodAi/actions/convertDocument.py @@ -1,31 +1,13 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Convert Document action for AI operations. -Converts documents between different formats (PDF→Word, Excel→CSV, etc.). -""" - import logging from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult logger = logging.getLogger(__name__) -@action async def convertDocument(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Convert documents between different formats (PDF→Word, Excel→CSV, etc.). - - Input requirements: documentList (required); targetFormat (required). - - Output format: Document in target format. - - Parameters: - - documentList (list, required): Document reference(s) to convert. - - targetFormat (str, required): Target format extension (docx, pdf, xlsx, csv, txt, html, json, md, etc.). - - preserveStructure (bool, optional): Whether to preserve document structure (headings, tables, etc.). Default: True. - """ documentList = parameters.get("documentList", []) if not documentList: return ActionResult.isFailure(error="documentList is required") diff --git a/modules/workflows/methods/methodAi/actions/generateCode.py b/modules/workflows/methods/methodAi/actions/generateCode.py new file mode 100644 index 00000000..52e36316 --- /dev/null +++ b/modules/workflows/methods/methodAi/actions/generateCode.py @@ -0,0 +1,135 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. 
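+# ai.generateCode action: auto-detects the target language/format from the prompt
+# when resultType is not provided, then routes through callAiContent with
+# generationIntent="code" (see below).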
+
+import logging
+import time
+from typing import Dict, Any, Optional, List
+from modules.datamodels.datamodelChat import ActionResult, ActionDocument
+from modules.datamodels.datamodelExtraction import ContentPart
+from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, PriorityEnum, ProcessingModeEnum
+from modules.datamodels.datamodelWorkflow import AiResponse, DocumentData
+
+logger = logging.getLogger(__name__)
+
+async def generateCode(self, parameters: Dict[str, Any]) -> ActionResult:
+    prompt = parameters.get("prompt")
+    if not prompt:
+        return ActionResult.isFailure(error="prompt is required")
+
+    documentList = parameters.get("documentList", [])
+    resultType = parameters.get("resultType")
+
+    # Auto-detect format from prompt if not provided
+    if not resultType:
+        promptLower = prompt.lower()
+        if ".html" in promptLower or "html file" in promptLower:
+            resultType = "html"
+        elif "javascript" in promptLower or (".js" in promptLower and ".json" not in promptLower):
+            # ".js" alone would also match ".json", so JSON-related prompts are excluded here
+            resultType = "js"
+        elif ".py" in promptLower or "python" in promptLower:
+            resultType = "py"
+        elif ".ts" in promptLower or "typescript" in promptLower:
+            resultType = "ts"
+        elif ".java" in promptLower:
+            resultType = "java"
+        elif ".cpp" in promptLower or ".c++" in promptLower:
+            resultType = "cpp"
+        else:
+            resultType = "txt"  # Default
+
+    # Create operation ID for progress tracking (currently only constructed;
+    # this action does not emit progress log entries itself yet)
+    workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
+    operationId = f"code_gen_{workflowId}_{int(time.time())}"
+    parentOperationId = parameters.get('parentOperationId')
+
+    try:
+        # Convert documentList to DocumentReferenceList if needed
+        docRefList = None
+        if documentList:
+            from modules.datamodels.datamodelDocref import DocumentReferenceList
+
+            if isinstance(documentList, DocumentReferenceList):
+                docRefList = documentList
+            elif isinstance(documentList, str):
+                docRefList = DocumentReferenceList.from_string_list([documentList])
+            elif isinstance(documentList, list):
+                docRefList = DocumentReferenceList.from_string_list(documentList)
+            else:
+                docRefList = DocumentReferenceList(references=[])
+
+        # Prepare title
+        title = "Generated Code"
+
+        # Call AI service with explicit code intent
+        options = AiCallOptions(
+            operationType=OperationTypeEnum.DATA_GENERATE,
+            priority=PriorityEnum.BALANCED,
+            processingMode=ProcessingModeEnum.DETAILED
+        )
+
+        aiResponse: AiResponse = await self.services.ai.callAiContent(
+            prompt=prompt,
+            options=options,
+            documentList=docRefList,
+            outputFormat=resultType,
+            title=title,
+            parentOperationId=parentOperationId,
+            generationIntent="code"  # Explicit intent, skips detection
+        )
+
+        # Convert AiResponse to ActionResult
+        documents = []
+
+        # Convert DocumentData to ActionDocument
+        if aiResponse.documents:
+            for docData in aiResponse.documents:
+                documents.append(ActionDocument(
+                    documentName=docData.documentName,
+                    documentData=docData.documentData,
+                    mimeType=docData.mimeType,
+                    sourceJson=docData.sourceJson if hasattr(docData, 'sourceJson') else None
+                ))
+
+        # If no documents but content exists, create a document from content
+        if not documents and aiResponse.content:
+            # Determine document name from metadata
+            docName = f"code.{resultType}"
+            if aiResponse.metadata and aiResponse.metadata.filename:
+                docName = aiResponse.metadata.filename
+            elif aiResponse.metadata and aiResponse.metadata.title:
+                import re
+                sanitized = re.sub(r"[^a-zA-Z0-9._-]", "_", aiResponse.metadata.title)
+                sanitized = re.sub(r"_+", "_", sanitized).strip("_")
+                if sanitized:
+                    if not sanitized.lower().endswith(f".{resultType}"):
+                        docName = f"{sanitized}.{resultType}"
+                    else:
+                        docName = sanitized
+
+            # Determine mime type
+            mimeType = "text/plain"
+            if resultType == "html":
+                mimeType = "text/html"
+            elif resultType == "js":
+                mimeType = "application/javascript"
+            elif resultType == "py":
+                mimeType = "text/x-python"
+            elif resultType == "ts":
+                mimeType = "application/typescript"
+            elif resultType == "java":
+                mimeType = "text/x-java-source"
+            elif resultType == "cpp":
+                mimeType = "text/x-c++src"
+
+            documents.append(ActionDocument(
+                documentName=docName,
+                documentData=aiResponse.content.encode('utf-8') if isinstance(aiResponse.content, str) else aiResponse.content,
+                mimeType=mimeType
+            ))
+
+        return ActionResult.isSuccess(documents=documents)
+
+    except Exception as e:
+        logger.error(f"Error in code generation: {str(e)}")
+        return ActionResult.isFailure(error=str(e))
+
diff --git a/modules/workflows/methods/methodAi/actions/generateDocument.py b/modules/workflows/methods/methodAi/actions/generateDocument.py
index 6569ddab..4e67251b 100644
--- a/modules/workflows/methods/methodAi/actions/generateDocument.py
+++ b/modules/workflows/methods/methodAi/actions/generateDocument.py
@@ -1,15 +1,9 @@
 # Copyright (c) 2025 Patrick Motsch
 # All rights reserved.
-"""
-Generate Document action for AI operations.
-Wrapper around AI service callAiContent method.
-"""
-
 import logging
 import time
 from typing import Dict, Any, Optional, List
-from modules.workflows.methods.methodBase import action
 from modules.datamodels.datamodelChat import ActionResult, ActionDocument
 from modules.datamodels.datamodelExtraction import ContentPart
 from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, PriorityEnum, ProcessingModeEnum
@@ -17,23 +11,7 @@ from modules.datamodels.datamodelWorkflow import AiResponse, DocumentData
 
 logger = logging.getLogger(__name__)
 
-@action
 async def generateDocument(self, parameters: Dict[str, Any]) -> ActionResult:
-    """
-    GENERAL:
-    - Purpose: Generate documents from scratch or based on templates/inputs using hierarchical approach.
-    - Input requirements: prompt or description (required); optional documentList (for templates/references).
-    - Output format: Document in specified format. Any format supported by dynamically registered renderers is acceptable (default: txt).
-    - Parameters:
-        - prompt (str, required): Description of the document to generate.
-        - documentList (list, optional): Template documents or reference documents to use as a guide.
-        - documentType (str, optional): Type of document - letter, memo, proposal, contract, etc.
-        - resultType (str, optional): Output format. Any format supported by dynamically registered renderers is acceptable (formats are discovered automatically from renderer registry). Common formats: txt, html, pdf, docx, md, json, csv, xlsx, pptx, png, jpg. Default: txt.
-        - maxSectionLength (int, optional): Maximum words for simple sections. Default: 500.
-        - parallelGeneration (bool, optional): Enable parallel section generation. Default: True.
-        - progressLogging (bool, optional): Send ChatLog progress updates. Default: True.
- """ prompt = parameters.get("prompt") if not prompt: return ActionResult.isFailure(error="prompt is required") @@ -97,7 +75,8 @@ async def generateDocument(self, parameters: Dict[str, Any]) -> ActionResult: documentList=docRefList, # Übergebe documentList direkt - callAiContent macht Phasen 5A-5E outputFormat=resultType, title=title, - parentOperationId=parentOperationId + parentOperationId=parentOperationId, + generationIntent="document" # NEW: Explicit intent, skips detection ) # Convert AiResponse to ActionResult diff --git a/modules/workflows/methods/methodAi/actions/process.py b/modules/workflows/methods/methodAi/actions/process.py index 807c1a64..5f05afed 100644 --- a/modules/workflows/methods/methodAi/actions/process.py +++ b/modules/workflows/methods/methodAi/actions/process.py @@ -1,36 +1,17 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Process action for AI operations. -Universal AI document processing action. -""" - import logging import time import json from typing import Dict, Any, List, Optional -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument from modules.datamodels.datamodelAi import AiCallOptions from modules.datamodels.datamodelExtraction import ContentPart logger = logging.getLogger(__name__) -@action async def process(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Universal AI document processing action - accepts MULTIPLE input documents in ANY format (docx, pdf, json, txt, xlsx, html, images, etc.) and processes them together with a prompt to produce MULTIPLE output documents in ANY specified format (via resultType). Use for document generation, format conversion, content transformation, analysis, summarization, translation, extraction, comparison, and any AI-powered document manipulation. - - Input requirements: aiPrompt (required); optional documentList (can contain multiple documents in any format). - - Output format: Multiple documents in the same format per call (via resultType: txt, json, pdf, docx, xlsx, pptx, png, jpg, etc.). The AI can generate multiple files based on the prompt (e.g., "create separate documents for each section"). Default: txt. - - Key capabilities: Can process any number of input documents together, extract data from mixed formats, combine information, generate multiple output files, transform between formats, perform analysis/comparison/summarization on document sets. - - Parameters: - - aiPrompt (str, required): Instruction for the AI describing what processing to perform. - - documentList (list, optional): Document reference(s) in any format to use as input/context. - - resultType (str, optional): Output file extension (txt, json, md, csv, xml, html, pdf, docx, xlsx, png, etc.). All output documents will use this format. Default: txt. 
- """ try: # Init progress logger workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}" @@ -88,7 +69,8 @@ async def process(self, parameters: Dict[str, Any]) -> ActionResult: output_mime_type = "application/octet-stream" # Prefer service-provided mimeType when available logger.info(f"Using result type: {resultType} -> {output_extension}") - # Check if contentParts are already provided (from context.extractContent or other sources) + # Phase 7.3: Extract content first if documents provided, then use contentParts + # Check if contentParts are already provided (preferred path) contentParts: Optional[List[ContentPart]] = None if "contentParts" in parameters: contentParts = parameters.get("contentParts") @@ -100,42 +82,96 @@ async def process(self, parameters: Dict[str, Any]) -> ActionResult: logger.warning(f"Invalid contentParts type: {type(contentParts)}, treating as empty") contentParts = None + # If contentParts not provided but documentList is, extract content first + if not contentParts and documentList.references: + self.services.chat.progressLogUpdate(operationId, 0.3, "Extracting content from documents") + + # Get ChatDocuments + chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(documentList) + if not chatDocuments: + logger.warning("No documents found in documentList") + else: + logger.info(f"Extracting content from {len(chatDocuments)} documents") + + # Prepare extraction options (use defaults if not provided) + from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy + extractionOptions = parameters.get("extractionOptions") + if not extractionOptions: + extractionOptions = ExtractionOptions( + prompt="Extract all content from the document", + mergeStrategy=MergeStrategy( + mergeType="concatenate", + groupBy="typeGroup", + orderBy="id" + ), + processDocumentsIndividually=True + ) + + # Extract content using extraction service + extractedResults = self.services.extraction.extractContent(chatDocuments, extractionOptions) + + # Combine all ContentParts from all extracted results + contentParts = [] + for extracted in extractedResults: + if extracted.parts: + contentParts.extend(extracted.parts) + + logger.info(f"Extracted {len(contentParts)} content parts from {len(extractedResults)} documents") + # Update progress - preparing AI call self.services.chat.progressLogUpdate(operationId, 0.4, "Preparing AI call") - # Build options + # Detect image generation from resultType + imageFormats = ["png", "jpg", "jpeg", "gif", "webp"] + isImageGeneration = normalized_result_type in imageFormats + + # Build options with correct operationType output_format = output_extension.replace('.', '') or 'txt' + from modules.datamodels.datamodelAi import OperationTypeEnum options = AiCallOptions( - resultFormat=output_format + resultFormat=output_format, + operationType=OperationTypeEnum.IMAGE_GENERATE if isImageGeneration else OperationTypeEnum.DATA_GENERATE ) + + # Get generationIntent from parameters + generationIntent = parameters.get("generationIntent") + + # For DATA_GENERATE, generationIntent is REQUIRED + # If not provided, default to "document" for document formats (xlsx, docx, pdf, txt, html, etc.) 
+        # This is format-based defaulting, not prompt-based auto-detection
+        if options.operationType == OperationTypeEnum.DATA_GENERATE and not generationIntent:
+            # Document formats (default to document generation)
+            documentFormats = ["xlsx", "docx", "pdf", "txt", "md", "html", "csv", "xml", "json", "pptx"]
+            # Code formats (should use ai.generateCode instead, but default to code if ai.process is used)
+            codeFormats = ["py", "js", "ts", "java", "cpp", "c", "go", "rs", "rb", "php", "swift", "kt"]
+
+            if normalized_result_type in documentFormats:
+                generationIntent = "document"
+                logger.info(f"Defaulting generationIntent to 'document' for resultType '{normalized_result_type}'")
+            elif normalized_result_type in codeFormats:
+                generationIntent = "code"
+                logger.info(f"Defaulting generationIntent to 'code' for resultType '{normalized_result_type}'")
+            else:
+                # Unknown format - default to document (most common use case)
+                generationIntent = "document"
+                logger.warning(
+                    f"Unknown resultType '{normalized_result_type}', defaulting generationIntent to 'document'. "
+                    f"For code generation, use ai.generateCode action or explicitly pass generationIntent='code'."
+                )
 
         # Update progress - calling AI
         self.services.chat.progressLogUpdate(operationId, 0.6, "Calling AI")
 
-        # Use unified callAiContent method
-        # If contentParts provided (pre-extracted), use them directly
-        # Otherwise, pass documentList and let callAiContent handle Phases 5A-5E internally
-        # Note: ContentExtracted documents (from context.extractContent) are now handled
-        # automatically in _extractAndPrepareContent() (Phase 5B)
-        if contentParts:
-            # Pre-extracted ContentParts - use them directly
-            aiResponse = await self.services.ai.callAiContent(
-                prompt=aiPrompt,
-                options=options,
-                contentParts=contentParts,  # Pre-extracted ContentParts
-                outputFormat=output_format,
-                parentOperationId=operationId
-            )
-        else:
-            # Pass documentList - callAiContent handles Phases 5A-5E internally
-            # This includes automatic detection of ContentExtracted documents
-            aiResponse = await self.services.ai.callAiContent(
-                prompt=aiPrompt,
-                options=options,
-                documentList=documentList,  # callAiContent runs Phases 5A-5E
-                outputFormat=output_format,
-                parentOperationId=operationId
-            )
+        # Use unified callAiContent method with contentParts (extraction is now separate)
+        # ContentParts are already extracted above (or None if no documents)
+        aiResponse = await self.services.ai.callAiContent(
+            prompt=aiPrompt,
+            options=options,
+            contentParts=contentParts,  # Already extracted (or None if no documents)
+            outputFormat=output_format,
+            parentOperationId=operationId,
+            generationIntent=generationIntent  # REQUIRED for DATA_GENERATE
+        )
 
         # Update progress - processing result
         self.services.chat.progressLogUpdate(operationId, 0.8, "Processing result")
diff --git a/modules/workflows/methods/methodAi/actions/summarizeDocument.py b/modules/workflows/methods/methodAi/actions/summarizeDocument.py
index 80588712..619e80c2 100644
--- a/modules/workflows/methods/methodAi/actions/summarizeDocument.py
+++ b/modules/workflows/methods/methodAi/actions/summarizeDocument.py
@@ -1,32 +1,13 @@
 # Copyright (c) 2025 Patrick Motsch
 # All rights reserved.
-"""
-Summarize Document action for AI operations.
-Summarizes one or more documents, extracting key points and main ideas.
-""" - import logging from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult logger = logging.getLogger(__name__) -@action async def summarizeDocument(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Summarize one or more documents, extracting key points and main ideas. - - Input requirements: documentList (required); optional summaryLength, focus. - - Output format: Text document with summary (default: txt, can be overridden with resultType). - - Parameters: - - documentList (list, required): Document reference(s) to summarize. - - summaryLength (str, optional): Desired summary length - brief, medium, or detailed. Default: medium. - - focus (str, optional): Specific aspect to focus on in the summary (e.g., "financial data", "key decisions"). - - resultType (str, optional): Output file extension (txt, md, docx, etc.). Default: txt. - """ documentList = parameters.get("documentList", []) if not documentList: return ActionResult.isFailure(error="documentList is required") @@ -50,6 +31,7 @@ async def summarizeDocument(self, parameters: Dict[str, Any]) -> ActionResult: return await self.process({ "aiPrompt": aiPrompt, "documentList": documentList, - "resultType": resultType + "resultType": resultType, + "generationIntent": "document" # NEW: Explicit intent }) diff --git a/modules/workflows/methods/methodAi/actions/translateDocument.py b/modules/workflows/methods/methodAi/actions/translateDocument.py index 12264e39..7388dcc5 100644 --- a/modules/workflows/methods/methodAi/actions/translateDocument.py +++ b/modules/workflows/methods/methodAi/actions/translateDocument.py @@ -1,33 +1,13 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Translate Document action for AI operations. -Translates documents to a target language while preserving formatting and structure. -""" - import logging from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult logger = logging.getLogger(__name__) -@action async def translateDocument(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Translate documents to a target language while preserving formatting and structure. - - Input requirements: documentList (required); targetLanguage (required). - - Output format: Translated document in same format as input (default) or specified resultType. - - Parameters: - - documentList (list, required): Document reference(s) to translate. - - targetLanguage (str, required): Target language code or name (e.g., "de", "German", "French", "es"). - - sourceLanguage (str, optional): Source language if known (e.g., "en", "English"). If not provided, AI will detect. - - preserveFormatting (bool, optional): Whether to preserve original formatting. Default: True. - - resultType (str, optional): Output file extension. If not specified, uses same format as input. 
- """ documentList = parameters.get("documentList", []) if not documentList: return ActionResult.isFailure(error="documentList is required") @@ -51,7 +31,8 @@ async def translateDocument(self, parameters: Dict[str, Any]) -> ActionResult: processParams = { "aiPrompt": aiPrompt, - "documentList": documentList + "documentList": documentList, + "generationIntent": "document" # NEW: Explicit intent } if resultType: processParams["resultType"] = resultType diff --git a/modules/workflows/methods/methodAi/actions/webResearch.py b/modules/workflows/methods/methodAi/actions/webResearch.py index 2bd5c3dd..62b43bce 100644 --- a/modules/workflows/methods/methodAi/actions/webResearch.py +++ b/modules/workflows/methods/methodAi/actions/webResearch.py @@ -1,35 +1,15 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Web Research action for AI operations. -Web research with two-step process: search for URLs, then crawl content. -""" - import logging import time import re from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def webResearch(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Web research with two-step process: search for URLs, then crawl content. - - Input requirements: prompt (required); optional list(url), country, language, researchDepth. - - Output format: JSON with research results including URLs and content. - - Parameters: - - prompt (str, required): Natural language research instruction. - - urlList (list, optional): Specific URLs to crawl, if needed. - - country (str, optional): Two-digit country code (lowercase, e.g., ch, us, de). - - language (str, optional): Language code (lowercase, e.g., de, en, fr). - - researchDepth (str, optional): Research depth - fast, general, or deep. Default: general. - """ try: prompt = parameters.get("prompt") if not prompt: diff --git a/modules/workflows/methods/methodAi/methodAi.py b/modules/workflows/methods/methodAi/methodAi.py index 881b007d..86efe406 100644 --- a/modules/workflows/methods/methodAi/methodAi.py +++ b/modules/workflows/methods/methodAi/methodAi.py @@ -17,6 +17,7 @@ from .actions.summarizeDocument import summarizeDocument from .actions.translateDocument import translateDocument from .actions.convertDocument import convertDocument from .actions.generateDocument import generateDocument +from .actions.generateCode import generateCode logger = logging.getLogger(__name__) @@ -59,6 +60,14 @@ class MethodAi(MethodBase): required=False, default="txt", description="Output file extension. All output documents will use this format" + ), + "generationIntent": WorkflowActionParameter( + name="generationIntent", + type="str", + frontendType=FrontendType.SELECT, + frontendOptions=["document", "code", "image"], + required=False, + description="Explicit generation intent (\"document\" | \"code\" | \"image\"). For DATA_GENERATE operations, if not provided, defaults based on resultType: document formats (xlsx, docx, pdf, etc.) → \"document\", code formats (py, js, ts, etc.) → \"code\". For IMAGE_GENERATE operations, this parameter is ignored. Best practice: Use qualified actions (ai.generateDocument, ai.generateCode) instead of ai.process." 
) }, execute=process.__get__(self, self.__class__) @@ -256,6 +265,35 @@ class MethodAi(MethodBase): ) }, execute=generateDocument.__get__(self, self.__class__) + ), + "generateCode": WorkflowActionDefinition( + actionId="ai.generateCode", + description="Generate code files - explicitly sets intent to 'code'", + parameters={ + "prompt": WorkflowActionParameter( + name="prompt", + type="str", + frontendType=FrontendType.TEXTAREA, + required=True, + description="Description of code to generate" + ), + "documentList": WorkflowActionParameter( + name="documentList", + type="List[str]", + frontendType=FrontendType.DOCUMENT_REFERENCE, + required=False, + description="Reference documents" + ), + "resultType": WorkflowActionParameter( + name="resultType", + type="str", + frontendType=FrontendType.SELECT, + frontendOptions=["py", "js", "ts", "html", "java", "cpp", "txt"], + required=False, + description="Output format (html, js, py, etc.). Default: based on prompt" + ) + }, + execute=generateCode.__get__(self, self.__class__) ) } @@ -269,6 +307,7 @@ class MethodAi(MethodBase): self.translateDocument = translateDocument.__get__(self, self.__class__) self.convertDocument = convertDocument.__get__(self, self.__class__) self.generateDocument = generateDocument.__get__(self, self.__class__) + self.generateCode = generateCode.__get__(self, self.__class__) def _format_timestamp_for_filename(self) -> str: """Format current timestamp as YYYYMMDD-hhmmss for filenames.""" diff --git a/modules/workflows/methods/methodContext.py.old b/modules/workflows/methods/methodContext.py.old deleted file mode 100644 index 0c7e1cae..00000000 --- a/modules/workflows/methods/methodContext.py.old +++ /dev/null @@ -1,460 +0,0 @@ -# Copyright (c) 2025 Patrick Motsch -# All rights reserved. -""" -Context and workflow information method module. -Handles workflow context queries and document indexing. -""" - -import time -import json -import logging -import aiohttp -from typing import Dict, Any, List -from datetime import datetime, UTC - -from modules.workflows.methods.methodBase import MethodBase, action -from modules.datamodels.datamodelChat import ActionResult, ActionDocument -from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy -from modules.shared.configuration import APP_CONFIG - -logger = logging.getLogger(__name__) - -class MethodContext(MethodBase): - """Context and workflow information methods.""" - - def __init__(self, services): - super().__init__(services) - self.name = "context" - self.description = "Context and workflow information methods" - - @action - async def getDocumentIndex(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Generate a comprehensive index of all documents available in the current workflow, including documents from all rounds and tasks. - - Input requirements: No input documents required. Optional resultType parameter. - - Output format: Structured document index in JSON format (default) or text format, listing all documents with their references, metadata, and organization by rounds/tasks. - - Parameters: - - resultType (str, optional): Output format (json, txt, md). Default: json. 
- """ - try: - workflow = self.services.workflow - if not workflow: - return ActionResult.isFailure( - error="No workflow available" - ) - - resultType = parameters.get("resultType", "json").lower().strip().lstrip('.') - - # Get available documents index from chat service - documentsIndex = self.services.chat.getAvailableDocuments(workflow) - - if not documentsIndex or documentsIndex == "No documents available" or documentsIndex == "NO DOCUMENTS AVAILABLE - This workflow has no documents to process.": - # Return empty index structure - if resultType == "json": - indexData = { - "workflowId": getattr(workflow, 'id', 'unknown'), - "totalDocuments": 0, - "rounds": [], - "documentReferences": [] - } - indexContent = json.dumps(indexData, indent=2, ensure_ascii=False) - else: - indexContent = "Document Index\n==============\n\nNo documents available in this workflow.\n" - else: - # Parse the document index string to extract structured information - indexData = self._parseDocumentIndex(documentsIndex, workflow) - - if resultType == "json": - indexContent = json.dumps(indexData, indent=2, ensure_ascii=False) - elif resultType == "md": - indexContent = self._formatAsMarkdown(indexData) - else: # txt - indexContent = self._formatAsText(indexData, documentsIndex) - - # Generate meaningful filename - workflowContext = self.services.chat.getWorkflowContext() - filename = self._generateMeaningfulFileName( - "document_index", - resultType if resultType in ["json", "txt", "md"] else "json", - workflowContext, - "getDocumentIndex" - ) - - validationMetadata = { - "actionType": "context.getDocumentIndex", - "resultType": resultType, - "workflowId": getattr(workflow, 'id', 'unknown'), - "totalDocuments": indexData.get("totalDocuments", 0) if isinstance(indexData, dict) else 0 - } - - # Create ActionDocument - document = ActionDocument( - documentName=filename, - documentData=indexContent, - mimeType="application/json" if resultType == "json" else "text/plain", - validationMetadata=validationMetadata - ) - - return ActionResult.isSuccess(documents=[document]) - - except Exception as e: - logger.error(f"Error generating document index: {str(e)}") - return ActionResult.isFailure( - error=f"Failed to generate document index: {str(e)}" - ) - - def _parseDocumentIndex(self, documentsIndex: str, workflow: Any) -> Dict[str, Any]: - """Parse the document index string into structured data.""" - try: - indexData = { - "workflowId": getattr(workflow, 'id', 'unknown'), - "generatedAt": datetime.now(UTC).isoformat(), - "totalDocuments": 0, - "rounds": [], - "documentReferences": [] - } - - # Extract document references from the index string - lines = documentsIndex.split('\n') - currentRound = None - currentDocList = None - - for line in lines: - line = line.strip() - if not line: - continue - - # Check for round headers - if "Current round documents:" in line: - currentRound = "current" - continue - elif "Past rounds documents:" in line: - currentRound = "past" - continue - - # Check for document list references (docList:...) - if line.startswith("- docList:"): - docListRef = line.replace("- docList:", "").strip() - currentDocList = { - "reference": docListRef, - "round": currentRound, - "documents": [] - } - indexData["rounds"].append(currentDocList) - continue - - # Check for individual document references (docItem:...) 
- if line.startswith(" - docItem:") or line.startswith("- docItem:"): - docItemRef = line.replace(" - docItem:", "").replace("- docItem:", "").strip() - indexData["documentReferences"].append({ - "reference": docItemRef, - "round": currentRound, - "docList": currentDocList["reference"] if currentDocList else None - }) - indexData["totalDocuments"] += 1 - if currentDocList: - currentDocList["documents"].append(docItemRef) - - return indexData - - except Exception as e: - logger.error(f"Error parsing document index: {str(e)}") - return { - "workflowId": getattr(workflow, 'id', 'unknown'), - "error": f"Failed to parse document index: {str(e)}", - "rawIndex": documentsIndex - } - - def _formatAsMarkdown(self, indexData: Dict[str, Any]) -> str: - """Format document index as Markdown.""" - try: - md = f"# Document Index\n\n" - md += f"**Workflow ID:** {indexData.get('workflowId', 'unknown')}\n\n" - md += f"**Generated At:** {indexData.get('generatedAt', 'unknown')}\n\n" - md += f"**Total Documents:** {indexData.get('totalDocuments', 0)}\n\n" - - if indexData.get('rounds'): - md += "## Documents by Round\n\n" - for roundInfo in indexData['rounds']: - roundLabel = roundInfo.get('round', 'unknown').title() - md += f"### {roundLabel} Round\n\n" - md += f"**Document List:** `{roundInfo.get('reference', 'unknown')}`\n\n" - if roundInfo.get('documents'): - md += "**Documents:**\n\n" - for docRef in roundInfo['documents']: - md += f"- `{docRef}`\n" - md += "\n" - - if indexData.get('documentReferences'): - md += "## All Document References\n\n" - for docRef in indexData['documentReferences']: - md += f"- `{docRef.get('reference', 'unknown')}`\n" - - return md - - except Exception as e: - logger.error(f"Error formatting as Markdown: {str(e)}") - return f"# Document Index\n\nError formatting index: {str(e)}\n" - - def _formatAsText(self, indexData: Dict[str, Any], rawIndex: str) -> str: - """Format document index as plain text.""" - try: - text = "Document Index\n" - text += "=" * 50 + "\n\n" - text += f"Workflow ID: {indexData.get('workflowId', 'unknown')}\n" - text += f"Generated At: {indexData.get('generatedAt', 'unknown')}\n" - text += f"Total Documents: {indexData.get('totalDocuments', 0)}\n\n" - - # Include the raw formatted index for readability - text += rawIndex - - return text - - except Exception as e: - logger.error(f"Error formatting as text: {str(e)}") - return f"Document Index\n\nError formatting index: {str(e)}\n\nRaw index:\n{rawIndex}\n" - - @action - async def extractContent(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Extract content from documents (separate from AI calls). - - This action performs pure content extraction without AI processing. - The extracted ContentParts can then be used by subsequent AI processing actions. - - Parameters: - - documentList (list, required): Document reference(s) to extract content from. - - extractionOptions (dict, optional): Extraction options (if not provided, defaults are used). 
- - Returns: - - ActionResult with ActionDocument containing ContentExtracted objects - - ContentExtracted.parts contains List[ContentPart] (already chunked if needed) - """ - try: - # Init progress logger - workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}" - operationId = f"context_extract_{workflowId}_{int(time.time())}" - - # Extract documentList from parameters dict - from modules.datamodels.datamodelDocref import DocumentReferenceList - documentListParam = parameters.get("documentList") - if not documentListParam: - return ActionResult.isFailure(error="documentList is required") - - # Convert to DocumentReferenceList if needed - if isinstance(documentListParam, DocumentReferenceList): - documentList = documentListParam - elif isinstance(documentListParam, str): - documentList = DocumentReferenceList.from_string_list([documentListParam]) - elif isinstance(documentListParam, list): - documentList = DocumentReferenceList.from_string_list(documentListParam) - else: - return ActionResult.isFailure(error=f"Invalid documentList type: {type(documentListParam)}") - - # Start progress tracking - parentOperationId = parameters.get('parentOperationId') - self.services.chat.progressLogStart( - operationId, - "Extracting content from documents", - "Content Extraction", - f"Documents: {len(documentList.references)}", - parentOperationId=parentOperationId - ) - - # Get ChatDocuments from documentList - self.services.chat.progressLogUpdate(operationId, 0.2, "Loading documents") - chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(documentList) - - if not chatDocuments: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="No documents found in documentList") - - logger.info(f"Extracting content from {len(chatDocuments)} documents") - - # Prepare extraction options - self.services.chat.progressLogUpdate(operationId, 0.3, "Preparing extraction options") - extractionOptionsParam = parameters.get("extractionOptions") - - # Convert dict to ExtractionOptions object if needed, or create defaults - if extractionOptionsParam: - if isinstance(extractionOptionsParam, dict): - # Convert dict to ExtractionOptions object - extractionOptions = ExtractionOptions(**extractionOptionsParam) - elif isinstance(extractionOptionsParam, ExtractionOptions): - extractionOptions = extractionOptionsParam - else: - # Invalid type, use defaults - extractionOptions = None - else: - extractionOptions = None - - # If extractionOptions not provided, create defaults - if not extractionOptions: - # Default extraction options for pure content extraction (no AI processing) - extractionOptions = ExtractionOptions( - prompt="Extract all content from the document", - mergeStrategy=MergeStrategy( - mergeType="concatenate", - groupBy="typeGroup", - orderBy="id" - ), - processDocumentsIndividually=True - ) - - # Call extraction service with hierarchical progress logging - self.services.chat.progressLogUpdate(operationId, 0.4, "Initiating") - self.services.chat.progressLogUpdate(operationId, 0.5, f"Extracting content from {len(chatDocuments)} documents") - # Pass operationId for hierarchical per-document progress logging - extractedResults = self.services.extraction.extractContent(chatDocuments, extractionOptions, operationId=operationId) - - # Build ActionDocuments from ContentExtracted results - self.services.chat.progressLogUpdate(operationId, 0.8, "Building result documents") - actionDocuments = [] - # Map extracted results back 
to original documents by index (results are in same order) - for i, extracted in enumerate(extractedResults): - # Get original document name if available - originalDoc = chatDocuments[i] if i < len(chatDocuments) else None - if originalDoc and hasattr(originalDoc, 'fileName') and originalDoc.fileName: - # Use original filename with "extracted_" prefix - baseName = originalDoc.fileName.rsplit('.', 1)[0] if '.' in originalDoc.fileName else originalDoc.fileName - documentName = f"{baseName}_extracted_{extracted.id}.json" - else: - # Fallback to generic name with index - documentName = f"document_{i+1:03d}_extracted_{extracted.id}.json" - - # Store ContentExtracted object in ActionDocument.documentData - validationMetadata = { - "actionType": "context.extractContent", - "documentIndex": i, - "extractedId": extracted.id, - "partCount": len(extracted.parts) if extracted.parts else 0, - "originalFileName": originalDoc.fileName if originalDoc and hasattr(originalDoc, 'fileName') else None - } - actionDoc = ActionDocument( - documentName=documentName, - documentData=extracted, # ContentExtracted object - mimeType="application/json", - validationMetadata=validationMetadata - ) - actionDocuments.append(actionDoc) - - self.services.chat.progressLogFinish(operationId, True) - - return ActionResult.isSuccess(documents=actionDocuments) - - except Exception as e: - logger.error(f"Error in content extraction: {str(e)}") - - # Complete progress tracking with failure - try: - self.services.chat.progressLogFinish(operationId, False) - except: - pass # Don't fail on progress logging errors - - return ActionResult.isFailure(error=str(e)) - - @action - async def triggerPreprocessingServer(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Trigger preprocessing server at customer tenant to update database with configuration. - - This action makes a POST request to the preprocessing server endpoint with the provided - configuration JSON. The authorization secret is retrieved from APP_CONFIG using the provided config key. - - Parameters: - - endpoint (str, required): The full URL endpoint for the preprocessing server API. - - configJson (dict or str, required): Configuration JSON object to send to the preprocessing server. Can be provided as a dict or as a JSON string that will be parsed. - - authSecretConfigKey (str, required): The APP_CONFIG key name to retrieve the authorization secret from. - - Returns: - - ActionResult with ActionDocument containing "ok" on success, or error message on failure. 
- """ - try: - endpoint = parameters.get("endpoint") - if not endpoint: - return ActionResult.isFailure(error="endpoint parameter is required") - - configJsonParam = parameters.get("configJson") - if not configJsonParam: - return ActionResult.isFailure(error="configJson parameter is required") - - authSecretConfigKey = parameters.get("authSecretConfigKey") - if not authSecretConfigKey: - return ActionResult.isFailure(error="authSecretConfigKey parameter is required") - - # Handle configJson as either dict or JSON string - if isinstance(configJsonParam, str): - try: - configJson = json.loads(configJsonParam) - except json.JSONDecodeError as e: - return ActionResult.isFailure(error=f"configJson is not valid JSON: {str(e)}") - elif isinstance(configJsonParam, dict): - configJson = configJsonParam - else: - return ActionResult.isFailure(error=f"configJson must be a dict or JSON string, got {type(configJsonParam)}") - - # Get authorization secret from APP_CONFIG using the provided config key - authSecret = APP_CONFIG.get(authSecretConfigKey) - if not authSecret: - errorMsg = f"{authSecretConfigKey} not found in APP_CONFIG" - logger.error(errorMsg) - return ActionResult.isFailure(error=errorMsg) - - # Prepare headers with authorization (default headers as in original function) - headers = { - "X-PP-API-Key": authSecret, - "Content-Type": "application/json" - } - - # Make POST request - timeout = aiohttp.ClientTimeout(total=60) - async with aiohttp.ClientSession(timeout=timeout) as session: - async with session.post( - endpoint, - headers=headers, - json=configJson - ) as response: - if response.status in [200, 201]: - responseText = await response.text() - logger.info(f"Preprocessing server trigger successful: {response.status}") - logger.debug(f"Response: {responseText}") - - # Generate meaningful filename - workflowContext = self.services.chat.getWorkflowContext() if hasattr(self.services, 'chat') else None - filename = self._generateMeaningfulFileName( - "preprocessing_result", - "txt", - workflowContext, - "triggerPreprocessingServer" - ) - - # Create validation metadata - validationMetadata = self._createValidationMetadata( - "triggerPreprocessingServer", - endpoint=endpoint, - statusCode=response.status, - responseText=responseText - ) - - # Return success with "ok" document - document = ActionDocument( - documentName=filename, - documentData="ok", - mimeType="text/plain", - validationMetadata=validationMetadata - ) - - return ActionResult.isSuccess(documents=[document]) - else: - errorText = await response.text() - errorMsg = f"Preprocessing server trigger failed: {response.status} - {errorText}" - logger.error(errorMsg) - return ActionResult.isFailure(error=errorMsg) - - except Exception as e: - errorMsg = f"Error triggering preprocessing server: {str(e)}" - logger.error(errorMsg) - return ActionResult.isFailure(error=errorMsg) - diff --git a/modules/workflows/methods/methodContext/actions/extractContent.py b/modules/workflows/methods/methodContext/actions/extractContent.py index 949ac63d..5b90ce13 100644 --- a/modules/workflows/methods/methodContext/actions/extractContent.py +++ b/modules/workflows/methods/methodContext/actions/extractContent.py @@ -1,49 +1,16 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Extract Content action for Context operations. -Extracts content from documents (separate from AI calls). 
-""" - import logging import time from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument from modules.datamodels.datamodelDocref import DocumentReferenceList from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy, ContentExtracted, ContentPart logger = logging.getLogger(__name__) -@action async def extractContent(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Extract raw content parts from documents without AI processing. - - This action performs pure content extraction WITHOUT AI/OCR processing. - It returns ContentParts with different typeGroups: - - "text": Extracted text from text-based formats (PDF text layers, Word docs, etc.) - - "image": Images as base64-encoded data (NOT converted to text, no OCR) - - "table": Tables as structured data - - "structure": Structured content (JSON, etc.) - - "container": Container elements (PDF pages, etc.) - - IMPORTANT: - - Images are returned as base64 data, NOT as extracted text - - No OCR is performed - images are preserved as visual elements - - Text extraction only works for text-based formats (not images) - - The extracted ContentParts can then be used by subsequent AI processing actions - - Parameters: - - documentList (list, required): Document reference(s) to extract content from. - - extractionOptions (dict, optional): Extraction options (if not provided, defaults are used). - - Returns: - - ActionResult with ActionDocument containing ContentExtracted objects - - ContentExtracted.parts contains List[ContentPart] with various typeGroups - - Each ContentPart has a typeGroup indicating its type (text, image, table, etc.) - """ try: # Init progress logger workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}" diff --git a/modules/workflows/methods/methodContext/actions/getDocumentIndex.py b/modules/workflows/methods/methodContext/actions/getDocumentIndex.py index 6c9a6700..9991285b 100644 --- a/modules/workflows/methods/methodContext/actions/getDocumentIndex.py +++ b/modules/workflows/methods/methodContext/actions/getDocumentIndex.py @@ -1,30 +1,14 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Get Document Index action for Context operations. -Generates a comprehensive index of all documents available in the current workflow. -""" - import logging import json from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def getDocumentIndex(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Generate a comprehensive index of all documents available in the current workflow, including documents from all rounds and tasks. - - Input requirements: No input documents required. Optional resultType parameter. - - Output format: Structured document index in JSON format (default) or text format, listing all documents with their references, metadata, and organization by rounds/tasks. - - Parameters: - - resultType (str, optional): Output format (json, txt, md). Default: json. 
- """ try: workflow = self.services.workflow if not workflow: diff --git a/modules/workflows/methods/methodContext/actions/neutralizeData.py b/modules/workflows/methods/methodContext/actions/neutralizeData.py index 240fe6b1..8e3b7185 100644 --- a/modules/workflows/methods/methodContext/actions/neutralizeData.py +++ b/modules/workflows/methods/methodContext/actions/neutralizeData.py @@ -1,35 +1,16 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Neutralize Data action for Context operations. -Neutralizes extracted content data from ContentExtracted documents. -""" - import logging import time from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument from modules.datamodels.datamodelDocref import DocumentReferenceList from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart logger = logging.getLogger(__name__) -@action async def neutralizeData(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Neutralize data from ContentExtracted documents. - - This action takes documents containing ContentExtracted objects (from extractContent) - and neutralizes the text data in ContentPart.data fields. - - Parameters: - - documentList (list, required): Document reference(s) containing ContentExtracted objects. - - Returns: - - ActionResult with ActionDocument containing neutralized ContentExtracted objects - """ try: # Init progress logger workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}" diff --git a/modules/workflows/methods/methodContext/actions/triggerPreprocessingServer.py b/modules/workflows/methods/methodContext/actions/triggerPreprocessingServer.py index 7ef16d5f..2f011a25 100644 --- a/modules/workflows/methods/methodContext/actions/triggerPreprocessingServer.py +++ b/modules/workflows/methods/methodContext/actions/triggerPreprocessingServer.py @@ -1,37 +1,16 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Trigger Preprocessing Server action for Context operations. -Triggers preprocessing server at customer tenant to update database with configuration. -""" - import logging import json import aiohttp from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument from modules.shared.configuration import APP_CONFIG logger = logging.getLogger(__name__) -@action async def triggerPreprocessingServer(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Trigger preprocessing server at customer tenant to update database with configuration. - - This action makes a POST request to the preprocessing server endpoint with the provided - configuration JSON. The authorization secret is retrieved from APP_CONFIG using the provided config key. - - Parameters: - - endpoint (str, required): The full URL endpoint for the preprocessing server API. - - configJson (dict or str, required): Configuration JSON object to send to the preprocessing server. Can be provided as a dict or as a JSON string that will be parsed. - - authSecretConfigKey (str, required): The APP_CONFIG key name to retrieve the authorization secret from. - - Returns: - - ActionResult with ActionDocument containing "ok" on success, or error message on failure. 
- """ try: endpoint = parameters.get("endpoint") if not endpoint: diff --git a/modules/workflows/methods/methodJira.py.old b/modules/workflows/methods/methodJira.py.old deleted file mode 100644 index 2be46c1f..00000000 --- a/modules/workflows/methods/methodJira.py.old +++ /dev/null @@ -1,1101 +0,0 @@ -# Copyright (c) 2025 Patrick Motsch -# All rights reserved. -""" -JIRA operations method module. -Handles JIRA ticket operations including connection, export, import, and data processing. -""" - -import logging -import json -import io -import pandas as pd -import csv as csv_module -from io import StringIO, BytesIO -from typing import Dict, Any, List, Optional -from datetime import datetime, UTC - -from modules.workflows.methods.methodBase import MethodBase, action -from modules.datamodels.datamodelChat import ActionResult, ActionDocument -from modules.datamodels.datamodelDocref import DocumentReferenceList -from modules.shared.configuration import APP_CONFIG - -logger = logging.getLogger(__name__) - -class MethodJira(MethodBase): - """JIRA operations methods.""" - - def __init__(self, services): - super().__init__(services) - self.name = "jira" - self.description = "JIRA operations methods" - # Store connections in memory (keyed by connectionId) - self._connections: Dict[str, Any] = {} - - def _convertAdfToText(self, adfData): - """Convert Atlassian Document Format (ADF) to plain text. - - Based on Atlassian Document Format specification for JIRA fields. - Handles paragraphs, lists, text formatting, and other ADF node types. - - Args: - adfData: ADF object or None - - Returns: - str: Plain text content, or empty string if None/invalid - """ - if not adfData or not isinstance(adfData, dict): - return "" - - if adfData.get("type") != "doc": - return str(adfData) if adfData else "" - - content = adfData.get("content", []) - if not isinstance(content, list): - return "" - - def extractTextFromContent(contentList, listLevel=0): - """Recursively extract text from ADF content with proper formatting.""" - textParts = [] - listCounter = 1 - - for item in contentList: - if not isinstance(item, dict): - continue - - itemType = item.get("type", "") - - if itemType == "text": - # Extract text content, preserving formatting - text = item.get("text", "") - marks = item.get("marks", []) - - # Handle text formatting (bold, italic, etc.) 
- if marks: - for mark in marks: - if mark.get("type") == "strong": - text = f"**{text}**" - elif mark.get("type") == "em": - text = f"*{text}*" - elif mark.get("type") == "code": - text = f"`{text}`" - elif mark.get("type") == "link": - attrs = mark.get("attrs", {}) - href = attrs.get("href", "") - if href: - text = f"[{text}]({href})" - - textParts.append(text) - - elif itemType == "hardBreak": - textParts.append("\n") - - elif itemType == "paragraph": - paragraphContent = item.get("content", []) - if paragraphContent: - paragraphText = extractTextFromContent(paragraphContent, listLevel) - if paragraphText.strip(): - textParts.append(paragraphText) - - elif itemType == "bulletList": - listContent = item.get("content", []) - for listItem in listContent: - if listItem.get("type") == "listItem": - listItemContent = listItem.get("content", []) - for listParagraph in listItemContent: - if listParagraph.get("type") == "paragraph": - listParagraphContent = listParagraph.get("content", []) - if listParagraphContent: - indent = " " * listLevel - bulletText = extractTextFromContent(listParagraphContent, listLevel + 1) - if bulletText.strip(): - textParts.append(f"{indent}• {bulletText}") - - elif itemType == "orderedList": - listContent = item.get("content", []) - for listItem in listContent: - if listItem.get("type") == "listItem": - listItemContent = listItem.get("content", []) - for listParagraph in listItemContent: - if listParagraph.get("type") == "paragraph": - listParagraphContent = listParagraph.get("content", []) - if listParagraphContent: - indent = " " * listLevel - orderedText = extractTextFromContent(listParagraphContent, listLevel + 1) - if orderedText.strip(): - textParts.append(f"{indent}{listCounter}. {orderedText}") - listCounter += 1 - - elif itemType == "listItem": - # Handle nested list items - listItemContent = item.get("content", []) - if listItemContent: - textParts.append(extractTextFromContent(listItemContent, listLevel)) - - elif itemType == "embedCard": - # Handle embedded content (videos, etc.) 
- attrs = item.get("attrs", {}) - url = attrs.get("url", "") - if url: - textParts.append(f"[Embedded Content: {url}]") - - elif itemType == "codeBlock": - # Handle code blocks - codeContent = item.get("content", []) - if codeContent: - codeText = extractTextFromContent(codeContent, listLevel) - if codeText.strip(): - textParts.append(f"```\n{codeText}\n```") - - elif itemType == "blockquote": - # Handle blockquotes - quoteContent = item.get("content", []) - if quoteContent: - quoteText = extractTextFromContent(quoteContent, listLevel) - if quoteText.strip(): - textParts.append(f"> {quoteText}") - - elif itemType == "heading": - # Handle headings - headingContent = item.get("content", []) - if headingContent: - headingText = extractTextFromContent(headingContent, listLevel) - if headingText.strip(): - level = item.get("attrs", {}).get("level", 1) - textParts.append(f"{'#' * level} {headingText}") - - elif itemType == "rule": - # Handle horizontal rules - textParts.append("---") - - else: - # Handle unknown types by trying to extract content - if "content" in item: - contentText = extractTextFromContent(item.get("content", []), listLevel) - if contentText.strip(): - textParts.append(contentText) - - return "\n".join(textParts) - - result = extractTextFromContent(content) - return result.strip() - - def _getDocumentData(self, documentReference: str) -> Any: - """Get document data from a document reference (string or document object).""" - try: - if isinstance(documentReference, str): - # Get document from workflow - documentList = DocumentReferenceList.from_string_list([documentReference]) - chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(documentList) - if not chatDocuments or len(chatDocuments) == 0: - return None - document = chatDocuments[0] - return document.documentData - else: - # Assume it's already a document object - return documentReference.documentData if hasattr(documentReference, 'documentData') else documentReference - except Exception as e: - logger.error(f"Error getting document data: {str(e)}") - return None - - def _parseJsonFromDocument(self, documentReference: str) -> Optional[Dict[str, Any]]: - """Parse JSON from a document reference.""" - data = self._getDocumentData(documentReference) - if data is None: - return None - - if isinstance(data, str): - try: - return json.loads(data) - except json.JSONDecodeError: - return None - elif isinstance(data, dict): - return data - else: - return None - - @action - async def connectJira(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Connect to JIRA instance and create ticket interface. 
- - Parameters: - - apiUsername (str, required): JIRA API username/email - - apiTokenConfigKey (str, required): APP_CONFIG key name for JIRA API token - - apiUrl (str, required): JIRA instance URL (e.g., https://example.atlassian.net) - - projectCode (str, required): JIRA project code (e.g., "DCS") - - issueType (str, required): JIRA issue type (e.g., "Task") - - taskSyncDefinition (str or dict, required): Field mapping definition as JSON string or dict - - Returns: - - ActionResult with ActionDocument containing connection ID - """ - try: - apiUsername = parameters.get("apiUsername") - if not apiUsername: - return ActionResult.isFailure(error="apiUsername parameter is required") - - apiTokenConfigKey = parameters.get("apiTokenConfigKey") - if not apiTokenConfigKey: - return ActionResult.isFailure(error="apiTokenConfigKey parameter is required") - - apiUrl = parameters.get("apiUrl") - if not apiUrl: - return ActionResult.isFailure(error="apiUrl parameter is required") - - projectCode = parameters.get("projectCode") - if not projectCode: - return ActionResult.isFailure(error="projectCode parameter is required") - - issueType = parameters.get("issueType") - if not issueType: - return ActionResult.isFailure(error="issueType parameter is required") - - taskSyncDefinitionParam = parameters.get("taskSyncDefinition") - if not taskSyncDefinitionParam: - return ActionResult.isFailure(error="taskSyncDefinition parameter is required") - - # Parse taskSyncDefinition - if isinstance(taskSyncDefinitionParam, str): - try: - taskSyncDefinition = json.loads(taskSyncDefinitionParam) - except json.JSONDecodeError as e: - return ActionResult.isFailure(error=f"taskSyncDefinition is not valid JSON: {str(e)}") - elif isinstance(taskSyncDefinitionParam, dict): - taskSyncDefinition = taskSyncDefinitionParam - else: - return ActionResult.isFailure(error=f"taskSyncDefinition must be a dict or JSON string, got {type(taskSyncDefinitionParam)}") - - # Get API token from APP_CONFIG - apiToken = APP_CONFIG.get(apiTokenConfigKey) - if not apiToken: - errorMsg = f"{apiTokenConfigKey} not found in APP_CONFIG" - logger.error(errorMsg) - return ActionResult.isFailure(error=errorMsg) - - # Create ticket interface - syncInterface = await self.services.ticket.connectTicket( - taskSyncDefinition=taskSyncDefinition, - connectorType="Jira", - connectorParams={ - "apiUsername": apiUsername, - "apiToken": apiToken, - "apiUrl": apiUrl, - "projectCode": projectCode, - "ticketType": issueType, - }, - ) - - # Store connection with unique ID - import uuid - connectionId = str(uuid.uuid4()) - self._connections[connectionId] = { - "interface": syncInterface, - "taskSyncDefinition": taskSyncDefinition, - "apiUrl": apiUrl, - "projectCode": projectCode, - } - - logger.info(f"JIRA connection established: {connectionId} (Project: {projectCode})") - - # Generate filename - workflowContext = self.services.chat.getWorkflowContext() if hasattr(self.services, 'chat') else None - filename = self._generateMeaningfulFileName( - "jira_connection", - "json", - workflowContext, - "connectJira" - ) - - # Create connection info document - connectionInfo = { - "connectionId": connectionId, - "apiUrl": apiUrl, - "projectCode": projectCode, - "issueType": issueType, - } - - validationMetadata = self._createValidationMetadata( - "connectJira", - connectionId=connectionId, - apiUrl=apiUrl, - projectCode=projectCode - ) - - document = ActionDocument( - documentName=filename, - documentData=json.dumps(connectionInfo, indent=2), - mimeType="application/json", - 
validationMetadata=validationMetadata - ) - - return ActionResult.isSuccess(documents=[document]) - - except Exception as e: - errorMsg = f"Error connecting to JIRA: {str(e)}" - logger.error(errorMsg) - return ActionResult.isFailure(error=errorMsg) - - @action - async def exportTicketsAsJson(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Export tickets from JIRA as JSON list. - - Parameters: - - connectionId (str, required): Connection ID from connectJira action result - - taskSyncDefinition (str or dict, optional): Field mapping definition (if not provided, uses stored definition) - - Returns: - - ActionResult with ActionDocument containing list of tickets as JSON - """ - try: - connectionIdParam = parameters.get("connectionId") - if not connectionIdParam: - return ActionResult.isFailure(error="connectionId parameter is required") - - # Get connection ID from document if it's a reference - connectionId = None - if isinstance(connectionIdParam, str): - # Try to parse from document reference - connectionInfo = self._parseJsonFromDocument(connectionIdParam) - if connectionInfo and "connectionId" in connectionInfo: - connectionId = connectionInfo["connectionId"] - else: - # Assume it's the connection ID directly - connectionId = connectionIdParam - - if not connectionId or connectionId not in self._connections: - return ActionResult.isFailure(error=f"Connection ID {connectionIdParam} not found. Ensure connectJira was called first.") - - connection = self._connections[connectionId] - syncInterface = connection["interface"] - - # Export tickets - dataList = await syncInterface.exportTicketsAsList() - - logger.info(f"Exported {len(dataList)} tickets from JIRA") - - # Generate filename - workflowContext = self.services.chat.getWorkflowContext() if hasattr(self.services, 'chat') else None - filename = self._generateMeaningfulFileName( - "jira_tickets_export", - "json", - workflowContext, - "exportTicketsAsJson" - ) - - validationMetadata = self._createValidationMetadata( - "exportTicketsAsJson", - connectionId=connectionId, - ticketCount=len(dataList) - ) - - document = ActionDocument( - documentName=filename, - documentData=json.dumps(dataList, indent=2, ensure_ascii=False), - mimeType="application/json", - validationMetadata=validationMetadata - ) - - return ActionResult.isSuccess(documents=[document]) - - except Exception as e: - errorMsg = f"Error exporting tickets from JIRA: {str(e)}" - logger.error(errorMsg) - return ActionResult.isFailure(error=errorMsg) - - @action - async def importTicketsFromJson(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Import ticket data from JSON back to JIRA. 
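# Illustrative sketch: chaining connectJira into exportTicketsAsJson. "jira"
# stands for the method instance that owns these actions (an assumption, as is
# the taskSyncDefinition shape; mergeTicketData below only inspects
# fieldConfig[0] == 'get', so column -> ["get"] is a plausible minimal form,
# and ActionResult is assumed to expose the documents list it was built with).
import json

async def syncTicketsSketch(jira) -> None:
    connectResult = await jira.connectJira({
        "apiUsername": "bot@example.com",
        "apiTokenConfigKey": "JIRA_API_TOKEN",   # resolved via APP_CONFIG
        "apiUrl": "https://example.atlassian.net",
        "projectCode": "DCS",
        "issueType": "Task",
        "taskSyncDefinition": {"ID": ["get"], "Summary": ["get"], "Status": ["get"]},
    })
    # The returned document carries {"connectionId": ...}; exportTicketsAsJson
    # accepts either a reference to that document or the bare ID string.
    connectionInfo = json.loads(connectResult.documents[0].documentData)
    exportResult = await jira.exportTicketsAsJson(
        {"connectionId": connectionInfo["connectionId"]}
    )
    tickets = json.loads(exportResult.documents[0].documentData)
    print(f"Exported {len(tickets)} tickets")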
- - Parameters: - - connectionId (str, required): Connection ID from connectJira action result - - ticketData (str, required): Document reference containing ticket data as JSON - - taskSyncDefinition (str or dict, optional): Field mapping definition (if not provided, uses stored definition) - - Returns: - - ActionResult with ActionDocument containing import result with counts - """ - try: - connectionIdParam = parameters.get("connectionId") - if not connectionIdParam: - return ActionResult.isFailure(error="connectionId parameter is required") - - ticketDataParam = parameters.get("ticketData") - if not ticketDataParam: - return ActionResult.isFailure(error="ticketData parameter is required") - - # Get connection ID from document if it's a reference - connectionId = None - if isinstance(connectionIdParam, str): - connectionInfo = self._parseJsonFromDocument(connectionIdParam) - if connectionInfo and "connectionId" in connectionInfo: - connectionId = connectionInfo["connectionId"] - else: - connectionId = connectionIdParam - - if not connectionId or connectionId not in self._connections: - return ActionResult.isFailure(error=f"Connection ID {connectionIdParam} not found. Ensure connectJira was called first.") - - connection = self._connections[connectionId] - syncInterface = connection["interface"] - - # Get ticket data from document - ticketDataJson = self._parseJsonFromDocument(ticketDataParam) - if ticketDataJson is None: - return ActionResult.isFailure(error="Could not parse ticket data from document reference") - - # Ensure it's a list - if not isinstance(ticketDataJson, list): - return ActionResult.isFailure(error="ticketData must be a JSON array") - - # Import tickets - await syncInterface.importListToTickets(ticketDataJson) - - logger.info(f"Imported {len(ticketDataJson)} tickets to JIRA") - - # Generate filename - workflowContext = self.services.chat.getWorkflowContext() if hasattr(self.services, 'chat') else None - filename = self._generateMeaningfulFileName( - "jira_import_result", - "json", - workflowContext, - "importTicketsFromJson" - ) - - importResult = { - "imported": len(ticketDataJson), - "connectionId": connectionId, - } - - validationMetadata = self._createValidationMetadata( - "importTicketsFromJson", - connectionId=connectionId, - importedCount=len(ticketDataJson) - ) - - document = ActionDocument( - documentName=filename, - documentData=json.dumps(importResult, indent=2), - mimeType="application/json", - validationMetadata=validationMetadata - ) - - return ActionResult.isSuccess(documents=[document]) - - except Exception as e: - errorMsg = f"Error importing tickets to JIRA: {str(e)}" - logger.error(errorMsg) - return ActionResult.isFailure(error=errorMsg) - - @action - async def mergeTicketData(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Merge JIRA export data with existing SharePoint data. 
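# The export and import actions above share one lookup convention:
# "connectionId" may be the bare UUID from connectJira or a reference to its
# connection-info document. The pattern, extracted for illustration (it uses
# the _parseJsonFromDocument helper defined earlier in this file):
def resolveConnectionIdSketch(self, connectionIdParam: str) -> str:
    connectionInfo = self._parseJsonFromDocument(connectionIdParam)
    if connectionInfo and "connectionId" in connectionInfo:
        return connectionInfo["connectionId"]   # document-reference case
    return connectionIdParam                    # bare connection-ID case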
- - Parameters: - - jiraData (str, required): Document reference containing JIRA ticket data as JSON array - - existingData (str, required): Document reference containing existing SharePoint data as JSON array - - taskSyncDefinition (str or dict, required): Field mapping definition - - idField (str, optional): Field name to use as ID for merging (default: "ID") - - Returns: - - ActionResult with ActionDocument containing merged data and merge details - """ - try: - jiraDataParam = parameters.get("jiraData") - if not jiraDataParam: - return ActionResult.isFailure(error="jiraData parameter is required") - - existingDataParam = parameters.get("existingData") - if not existingDataParam: - return ActionResult.isFailure(error="existingData parameter is required") - - taskSyncDefinitionParam = parameters.get("taskSyncDefinition") - if not taskSyncDefinitionParam: - return ActionResult.isFailure(error="taskSyncDefinition parameter is required") - - idField = parameters.get("idField", "ID") - - # Parse taskSyncDefinition - if isinstance(taskSyncDefinitionParam, str): - try: - taskSyncDefinition = json.loads(taskSyncDefinitionParam) - except json.JSONDecodeError as e: - return ActionResult.isFailure(error=f"taskSyncDefinition is not valid JSON: {str(e)}") - elif isinstance(taskSyncDefinitionParam, dict): - taskSyncDefinition = taskSyncDefinitionParam - else: - return ActionResult.isFailure(error=f"taskSyncDefinition must be a dict or JSON string, got {type(taskSyncDefinitionParam)}") - - # Get data from documents - jiraDataJson = self._parseJsonFromDocument(jiraDataParam) - if jiraDataJson is None or not isinstance(jiraDataJson, list): - return ActionResult.isFailure(error="Could not parse jiraData as JSON array") - - existingDataJson = self._parseJsonFromDocument(existingDataParam) - if existingDataJson is None or not isinstance(existingDataJson, list): - # Empty existing data is OK - existingDataJson = [] - - # Perform merge - existingLookup = {row.get(idField): row for row in existingDataJson if row.get(idField)} - mergedData: List[dict] = [] - changes: List[str] = [] - updatedCount = addedCount = unchangedCount = 0 - - for jiraRow in jiraDataJson: - jiraId = jiraRow.get(idField) - if jiraId and jiraId in existingLookup: - existingRow = existingLookup[jiraId].copy() - rowChanges: List[str] = [] - - for fieldName, fieldConfig in taskSyncDefinition.items(): - if fieldConfig[0] == 'get': - oldValue = "" if existingRow.get(fieldName) is None else str(existingRow.get(fieldName)) - newValue = "" if jiraRow.get(fieldName) is None else str(jiraRow.get(fieldName)) - - # Convert ADF data to readable text for logging - if isinstance(newValue, dict) and newValue.get("type") == "doc": - newValueReadable = self._convertAdfToText(newValue) - if oldValue != newValueReadable: - rowChanges.append(f"{fieldName}: '{oldValue[:100]}...' -> '{newValueReadable[:100]}...'") - elif oldValue != newValue: - # Truncate long values for logging - oldTruncated = oldValue[:100] + "..." if len(oldValue) > 100 else oldValue - newTruncated = newValue[:100] + "..." 
if len(newValue) > 100 else newValue - rowChanges.append(f"{fieldName}: '{oldTruncated}' -> '{newTruncated}'") - - existingRow[fieldName] = jiraRow.get(fieldName) - - mergedData.append(existingRow) - if rowChanges: - updatedCount += 1 - changes.append(f"Row ID {jiraId} updated: {', '.join(rowChanges)}") - else: - unchangedCount += 1 - del existingLookup[jiraId] - else: - mergedData.append(jiraRow) - addedCount += 1 - changes.append(f"Row ID {jiraId} added as new record") - - # Add remaining existing rows - for remaining in existingLookup.values(): - mergedData.append(remaining) - unchangedCount += 1 - - mergeDetails = { - "updated": updatedCount, - "added": addedCount, - "unchanged": unchangedCount, - "changes": changes - } - - logger.info(f"Merged ticket data: {updatedCount} updated, {addedCount} added, {unchangedCount} unchanged") - - # Generate filename - workflowContext = self.services.chat.getWorkflowContext() if hasattr(self.services, 'chat') else None - filename = self._generateMeaningfulFileName( - "merged_ticket_data", - "json", - workflowContext, - "mergeTicketData" - ) - - result = { - "data": mergedData, - "mergeDetails": mergeDetails - } - - validationMetadata = self._createValidationMetadata( - "mergeTicketData", - updated=updatedCount, - added=addedCount, - unchanged=unchangedCount - ) - - document = ActionDocument( - documentName=filename, - documentData=json.dumps(result, indent=2, ensure_ascii=False), - mimeType="application/json", - validationMetadata=validationMetadata - ) - - return ActionResult.isSuccess(documents=[document]) - - except Exception as e: - errorMsg = f"Error merging ticket data: {str(e)}" - logger.error(errorMsg) - return ActionResult.isFailure(error=errorMsg) - - @action - async def parseCsvContent(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Parse CSV content with custom headers. 
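# Worked sketch of the merge rule implemented above, reduced to its core: rows
# are keyed by idField, JIRA rows update matching existing rows field by field
# (only the 'get' mappings), unmatched JIRA rows are appended, and leftover
# existing rows are kept unchanged. syncFields corresponds to the
# taskSyncDefinition keys whose fieldConfig[0] == 'get'; change tracking and
# ADF logging are omitted here.
from typing import Any, Dict, List

def mergeByIdSketch(
    jiraRows: List[Dict[str, Any]],
    existingRows: List[Dict[str, Any]],
    syncFields: List[str],
    idField: str = "ID",
) -> List[Dict[str, Any]]:
    existingLookup = {row[idField]: row for row in existingRows if row.get(idField)}
    merged: List[Dict[str, Any]] = []
    for jiraRow in jiraRows:
        rowId = jiraRow.get(idField)
        if rowId and rowId in existingLookup:
            base = existingLookup.pop(rowId).copy()
            for field in syncFields:
                base[field] = jiraRow.get(field)   # JIRA value wins for mapped fields
            merged.append(base)                    # counted as updated/unchanged
        else:
            merged.append(jiraRow)                 # counted as added
    merged.extend(existingLookup.values())         # remaining rows kept unchanged
    return merged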
- - Parameters: - - csvContent (str, required): Document reference containing CSV file content as bytes - - skipRows (int, optional): Number of header rows to skip (default: 2) - - hasCustomHeaders (bool, optional): Whether CSV has custom header rows (default: true) - - Returns: - - ActionResult with ActionDocument containing parsed data and headers as JSON - """ - try: - csvContentParam = parameters.get("csvContent") - if not csvContentParam: - return ActionResult.isFailure(error="csvContent parameter is required") - - skipRows = parameters.get("skipRows", 2) - hasCustomHeaders = parameters.get("hasCustomHeaders", True) - - # Get CSV content from document - csvBytes = self._getDocumentData(csvContentParam) - if csvBytes is None: - return ActionResult.isFailure(error="Could not get CSV content from document reference") - - # Convert to bytes if needed - if isinstance(csvBytes, str): - csvBytes = csvBytes.encode('utf-8') - elif not isinstance(csvBytes, bytes): - return ActionResult.isFailure(error="CSV content must be bytes or string") - - # Parse headers if hasCustomHeaders - headers = {"header1": "Header 1", "header2": "Header 2"} - if hasCustomHeaders: - csvLines = csvBytes.decode('utf-8').split('\n') - if len(csvLines) >= 2: - headers["header1"] = csvLines[0].rstrip('\r\n') - headers["header2"] = csvLines[1].rstrip('\r\n') - - # Parse CSV data - df = pd.read_csv( - io.BytesIO(csvBytes), - skiprows=skipRows, - quoting=1, - escapechar='\\', - on_bad_lines='skip', - engine='python' - ) - - # Convert to dict records - for column in df.columns: - df[column] = df[column].astype('object').fillna('') - data = df.to_dict(orient='records') - - logger.info(f"Parsed CSV: {len(data)} rows, {len(df.columns)} columns") - - # Generate filename - workflowContext = self.services.chat.getWorkflowContext() if hasattr(self.services, 'chat') else None - filename = self._generateMeaningfulFileName( - "parsed_csv_data", - "json", - workflowContext, - "parseCsvContent" - ) - - result = { - "data": data, - "headers": headers, - "rowCount": len(data), - "columnCount": len(df.columns) - } - - validationMetadata = self._createValidationMetadata( - "parseCsvContent", - rowCount=len(data), - columnCount=len(df.columns), - skipRows=skipRows - ) - - document = ActionDocument( - documentName=filename, - documentData=json.dumps(result, indent=2, ensure_ascii=False), - mimeType="application/json", - validationMetadata=validationMetadata - ) - - return ActionResult.isSuccess(documents=[document]) - - except Exception as e: - errorMsg = f"Error parsing CSV content: {str(e)}" - logger.error(errorMsg) - return ActionResult.isFailure(error=errorMsg) - - @action - async def parseExcelContent(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Parse Excel content with custom headers. 
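# Example of the file layout parseCsvContent above expects with its defaults
# (hasCustomHeaders=True, skipRows=2): two free-form header lines, then the
# regular CSV table. Simplified sketch (the action itself reads bytes and
# passes quoting/escapechar options to pandas):
import io
import pandas as pd

sampleCsv = (
    "Project DCS Ticket Sync\n"               # becomes headers["header1"]
    "Exported,2025-01-01 00:00:00 UTC\n"      # becomes headers["header2"]
    "ID,Summary,Status\n"
    "DCS-1,Fix login,Done\n"
)
df = pd.read_csv(io.StringIO(sampleCsv), skiprows=2)
assert list(df.columns) == ["ID", "Summary", "Status"]
assert df.iloc[0]["Summary"] == "Fix login"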
- - Parameters: - - excelContent (str, required): Document reference containing Excel file content as bytes - - skipRows (int, optional): Number of header rows to skip (default: 3) - - hasCustomHeaders (bool, optional): Whether Excel has custom header rows (default: true) - - Returns: - - ActionResult with ActionDocument containing parsed data and headers as JSON - """ - try: - excelContentParam = parameters.get("excelContent") - if not excelContentParam: - return ActionResult.isFailure(error="excelContent parameter is required") - - skipRows = parameters.get("skipRows", 3) - hasCustomHeaders = parameters.get("hasCustomHeaders", True) - - # Get Excel content from document - excelBytes = self._getDocumentData(excelContentParam) - if excelBytes is None: - return ActionResult.isFailure(error="Could not get Excel content from document reference") - - # Convert to bytes if needed - if isinstance(excelBytes, str): - excelBytes = excelBytes.encode('latin-1') # Excel might have binary data - elif not isinstance(excelBytes, bytes): - return ActionResult.isFailure(error="Excel content must be bytes or string") - - # Parse Excel - df = pd.read_excel(BytesIO(excelBytes), engine='openpyxl', header=None) - - # Extract headers if hasCustomHeaders - headers = {"header1": "Header 1", "header2": "Header 2"} - if hasCustomHeaders and len(df) >= 3: - headerRow1 = df.iloc[0:1].copy() - headerRow2 = df.iloc[1:2].copy() - tableHeaders = df.iloc[2:3].copy() - dfData = df.iloc[skipRows:].copy() - dfData.columns = tableHeaders.iloc[0] - - headers = { - "header1": ",".join([str(x) if pd.notna(x) else "" for x in headerRow1.iloc[0].tolist()]), - "header2": ",".join([str(x) if pd.notna(x) else "" for x in headerRow2.iloc[0].tolist()]), - } - else: - # No custom headers, use standard parsing - if skipRows > 0: - dfData = df.iloc[skipRows:].copy() - if len(df) > skipRows: - dfData.columns = df.iloc[skipRows-1] - else: - dfData = df.copy() - - # Reset index and clean data - dfData = dfData.reset_index(drop=True) - for column in dfData.columns: - dfData[column] = dfData[column].astype('object').fillna('') - - data = dfData.to_dict(orient='records') - - logger.info(f"Parsed Excel: {len(data)} rows, {len(dfData.columns)} columns") - - # Generate filename - workflowContext = self.services.chat.getWorkflowContext() if hasattr(self.services, 'chat') else None - filename = self._generateMeaningfulFileName( - "parsed_excel_data", - "json", - workflowContext, - "parseExcelContent" - ) - - result = { - "data": data, - "headers": headers, - "rowCount": len(data), - "columnCount": len(dfData.columns) - } - - validationMetadata = self._createValidationMetadata( - "parseExcelContent", - rowCount=len(data), - columnCount=len(dfData.columns), - skipRows=skipRows - ) - - document = ActionDocument( - documentName=filename, - documentData=json.dumps(result, indent=2, ensure_ascii=False), - mimeType="application/json", - validationMetadata=validationMetadata - ) - - return ActionResult.isSuccess(documents=[document]) - - except Exception as e: - errorMsg = f"Error parsing Excel content: {str(e)}" - logger.error(errorMsg) - return ActionResult.isFailure(error=errorMsg) - - @action - async def createCsvContent(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Create CSV content with custom headers. 
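# The Excel variant above assumes this row layout when hasCustomHeaders is
# true (default skipRows=3): rows 0 and 1 are free-form header rows, row 2
# holds the real column names, data starts at row 3. Sketch with an in-memory
# frame standing in for the parsed workbook:
import pandas as pd

raw = pd.DataFrame([
    ["Project DCS Ticket Sync", None, None],      # header1
    ["Exported", "2025-01-01 00:00:00 UTC", None],# header2
    ["ID", "Summary", "Status"],                  # table headers (row 2)
    ["DCS-1", "Fix login", "Done"],               # data (row 3+)
])
data = raw.iloc[3:].copy()
data.columns = raw.iloc[2]
assert list(data.columns) == ["ID", "Summary", "Status"]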
- - Parameters: - - data (str, required): Document reference containing data as JSON (with "data" field from mergeTicketData) - - headers (str, optional): Document reference containing headers JSON (from parseCsvContent/parseExcelContent) - - columns (str or list, optional): List of column names (if not provided, extracted from taskSyncDefinition or data) - - taskSyncDefinition (str or dict, optional): Field mapping definition (used to extract column names if columns not provided) - - Returns: - - ActionResult with ActionDocument containing CSV content as bytes - """ - try: - dataParam = parameters.get("data") - if not dataParam: - return ActionResult.isFailure(error="data parameter is required") - - headersParam = parameters.get("headers") - columnsParam = parameters.get("columns") - taskSyncDefinitionParam = parameters.get("taskSyncDefinition") - - # Get data from document - dataJson = self._parseJsonFromDocument(dataParam) - if dataJson is None: - return ActionResult.isFailure(error="Could not parse data from document reference") - - # Extract data array if wrapped in object - if isinstance(dataJson, dict) and "data" in dataJson: - dataList = dataJson["data"] - elif isinstance(dataJson, list): - dataList = dataJson - else: - return ActionResult.isFailure(error="Data must be a JSON array or object with 'data' field") - - # Get headers - headers = {"header1": "Header 1", "header2": "Header 2"} - if headersParam: - headersJson = self._parseJsonFromDocument(headersParam) - if headersJson and isinstance(headersJson, dict) and "headers" in headersJson: - headers = headersJson["headers"] - elif headersJson and isinstance(headersJson, dict): - headers = headersJson - - # Get columns - if columnsParam: - if isinstance(columnsParam, str): - try: - columns = json.loads(columnsParam) if columnsParam.startswith('[') or columnsParam.startswith('{') else columnsParam.split(',') - except: - columns = columnsParam.split(',') - elif isinstance(columnsParam, list): - columns = columnsParam - else: - columns = None - elif taskSyncDefinitionParam: - # Extract columns from taskSyncDefinition - if isinstance(taskSyncDefinitionParam, str): - taskSyncDefinition = json.loads(taskSyncDefinitionParam) - else: - taskSyncDefinition = taskSyncDefinitionParam - columns = list(taskSyncDefinition.keys()) - elif dataList and len(dataList) > 0: - columns = list(dataList[0].keys()) - else: - columns = [] - - # Create DataFrame - if not dataList: - df = pd.DataFrame(columns=columns) - else: - df = pd.DataFrame(dataList) - # Ensure all columns exist - for col in columns: - if col not in df.columns: - df[col] = "" - # Reorder columns - df = df[columns] - - # Clean data - for column in df.columns: - df[column] = df[column].astype("object").fillna("") - df[column] = df[column].astype(str).str.replace('\n', '\\n', regex=False).str.replace('"', '""', regex=False) - - # Create headers with timestamp - timestamp = datetime.fromtimestamp(self.services.utils.timestampGetUtc(), UTC).strftime("%Y-%m-%d %H:%M:%S UTC") - header1Row = next(csv_module.reader([headers.get("header1", "Header 1")]), []) - header2Row = next(csv_module.reader([headers.get("header2", "Header 2")]), []) - if len(header2Row) > 1: - header2Row[1] = timestamp - - headerRow1 = pd.DataFrame([header1Row + [""] * (len(df.columns) - len(header1Row))], columns=df.columns) - headerRow2 = pd.DataFrame([header2Row + [""] * (len(df.columns) - len(header2Row))], columns=df.columns) - tableHeaders = pd.DataFrame([df.columns.tolist()], columns=df.columns) - finalDf = 
pd.concat([headerRow1, headerRow2, tableHeaders, df], ignore_index=True) - - # Convert to CSV bytes - out = StringIO() - finalDf.to_csv(out, index=False, header=False, quoting=1, escapechar='\\') - csvBytes = out.getvalue().encode('utf-8') - - logger.info(f"Created CSV content: {len(dataList)} rows, {len(columns)} columns") - - # Generate filename - workflowContext = self.services.chat.getWorkflowContext() if hasattr(self.services, 'chat') else None - filename = self._generateMeaningfulFileName( - "ticket_sync", - "csv", - workflowContext, - "createCsvContent" - ) - - validationMetadata = self._createValidationMetadata( - "createCsvContent", - rowCount=len(dataList), - columnCount=len(columns) - ) - - # Store as base64 for document - import base64 - csvBase64 = base64.b64encode(csvBytes).decode('utf-8') - - document = ActionDocument( - documentName=filename, - documentData=csvBase64, - mimeType="application/octet-stream", - validationMetadata=validationMetadata - ) - - return ActionResult.isSuccess(documents=[document]) - - except Exception as e: - errorMsg = f"Error creating CSV content: {str(e)}" - logger.error(errorMsg) - return ActionResult.isFailure(error=errorMsg) - - @action - async def createExcelContent(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Create Excel content with custom headers. - - Parameters: - - data (str, required): Document reference containing data as JSON (with "data" field from mergeTicketData) - - headers (str, optional): Document reference containing headers JSON (from parseExcelContent) - - columns (str or list, optional): List of column names (if not provided, extracted from taskSyncDefinition or data) - - taskSyncDefinition (str or dict, optional): Field mapping definition (used to extract column names if columns not provided) - - Returns: - - ActionResult with ActionDocument containing Excel content as bytes - """ - try: - dataParam = parameters.get("data") - if not dataParam: - return ActionResult.isFailure(error="data parameter is required") - - headersParam = parameters.get("headers") - columnsParam = parameters.get("columns") - taskSyncDefinitionParam = parameters.get("taskSyncDefinition") - - # Get data from document - dataJson = self._parseJsonFromDocument(dataParam) - if dataJson is None: - return ActionResult.isFailure(error="Could not parse data from document reference") - - # Extract data array if wrapped in object - if isinstance(dataJson, dict) and "data" in dataJson: - dataList = dataJson["data"] - elif isinstance(dataJson, list): - dataList = dataJson - else: - return ActionResult.isFailure(error="Data must be a JSON array or object with 'data' field") - - # Get headers - headers = {"header1": "Header 1", "header2": "Header 2"} - if headersParam: - headersJson = self._parseJsonFromDocument(headersParam) - if headersJson and isinstance(headersJson, dict) and "headers" in headersJson: - headers = headersJson["headers"] - elif headersJson and isinstance(headersJson, dict): - headers = headersJson - - # Get columns - if columnsParam: - if isinstance(columnsParam, str): - try: - columns = json.loads(columnsParam) if columnsParam.startswith('[') or columnsParam.startswith('{') else columnsParam.split(',') - except: - columns = columnsParam.split(',') - elif isinstance(columnsParam, list): - columns = columnsParam - else: - columns = None - elif taskSyncDefinitionParam: - # Extract columns from taskSyncDefinition - if isinstance(taskSyncDefinitionParam, str): - taskSyncDefinition = json.loads(taskSyncDefinitionParam) - else: - 
taskSyncDefinition = taskSyncDefinitionParam - columns = list(taskSyncDefinition.keys()) - elif dataList and len(dataList) > 0: - columns = list(dataList[0].keys()) - else: - columns = [] - - # Create DataFrame - if not dataList: - df = pd.DataFrame(columns=columns) - else: - df = pd.DataFrame(dataList) - # Ensure all columns exist - for col in columns: - if col not in df.columns: - df[col] = "" - # Reorder columns - df = df[columns] - - # Clean data - for column in df.columns: - df[column] = df[column].astype("object").fillna("") - df[column] = df[column].astype(str).str.replace('\n', '\\n', regex=False).str.replace('"', '""', regex=False) - - # Create headers with timestamp - timestamp = datetime.fromtimestamp(self.services.utils.timestampGetUtc(), UTC).strftime("%Y-%m-%d %H:%M:%S UTC") - header1Row = next(csv_module.reader([headers.get("header1", "Header 1")]), []) - header2Row = next(csv_module.reader([headers.get("header2", "Header 2")]), []) - if len(header2Row) > 1: - header2Row[1] = timestamp - - headerRow1 = pd.DataFrame([header1Row + [""] * (len(df.columns) - len(header1Row))], columns=df.columns) - headerRow2 = pd.DataFrame([header2Row + [""] * (len(df.columns) - len(header2Row))], columns=df.columns) - tableHeaders = pd.DataFrame([df.columns.tolist()], columns=df.columns) - finalDf = pd.concat([headerRow1, headerRow2, tableHeaders, df], ignore_index=True) - - # Convert to Excel bytes - buf = BytesIO() - finalDf.to_excel(buf, index=False, header=False, engine='openpyxl') - excelBytes = buf.getvalue() - - logger.info(f"Created Excel content: {len(dataList)} rows, {len(columns)} columns") - - # Generate filename - workflowContext = self.services.chat.getWorkflowContext() if hasattr(self.services, 'chat') else None - filename = self._generateMeaningfulFileName( - "ticket_sync", - "xlsx", - workflowContext, - "createExcelContent" - ) - - validationMetadata = self._createValidationMetadata( - "createExcelContent", - rowCount=len(dataList), - columnCount=len(columns) - ) - - # Store as base64 for document - import base64 - excelBase64 = base64.b64encode(excelBytes).decode('utf-8') - - document = ActionDocument( - documentName=filename, - documentData=excelBase64, - mimeType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - validationMetadata=validationMetadata - ) - - return ActionResult.isSuccess(documents=[document]) - - except Exception as e: - errorMsg = f"Error creating Excel content: {str(e)}" - logger.error(errorMsg) - return ActionResult.isFailure(error=errorMsg) diff --git a/modules/workflows/methods/methodJira/actions/connectJira.py b/modules/workflows/methods/methodJira/actions/connectJira.py index 8200514a..45b60cad 100644 --- a/modules/workflows/methods/methodJira/actions/connectJira.py +++ b/modules/workflows/methods/methodJira/actions/connectJira.py @@ -1,37 +1,16 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Connect JIRA action for JIRA operations. -Connects to JIRA instance and creates ticket interface. -""" - import logging import json import uuid from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument from modules.shared.configuration import APP_CONFIG logger = logging.getLogger(__name__) -@action async def connectJira(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Connect to JIRA instance and create ticket interface. 
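# Both content builders above emit the same sheet layout they later re-parse:
# the header1 row, the header2 row (cell [1] overwritten with the export
# timestamp), the column-name row, then the data rows. A sketch of the
# assembled frame with illustrative values:
import pandas as pd

columns = ["ID", "Summary", "Status"]
body = pd.DataFrame([["DCS-1", "Fix login", "Done"]], columns=columns)
header1 = pd.DataFrame([["Project DCS Ticket Sync", "", ""]], columns=columns)
header2 = pd.DataFrame([["Exported", "2025-01-01 00:00:00 UTC", ""]], columns=columns)
names = pd.DataFrame([columns], columns=columns)
finalDf = pd.concat([header1, header2, names, body], ignore_index=True)
# to_csv(..., header=False) / to_excel(..., header=False) then serialise all
# four blocks as plain rows, which round-trips through parseCsvContent
# (skipRows=2) and parseExcelContent (skipRows=3) with their defaults.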
- - Parameters: - - apiUsername (str, required): JIRA API username/email - - apiTokenConfigKey (str, required): APP_CONFIG key name for JIRA API token - - apiUrl (str, required): JIRA instance URL (e.g., https://example.atlassian.net) - - projectCode (str, required): JIRA project code (e.g., "DCS") - - issueType (str, required): JIRA issue type (e.g., "Task") - - taskSyncDefinition (str or dict, required): Field mapping definition as JSON string or dict - - Returns: - - ActionResult with ActionDocument containing connection ID - """ try: apiUsername = parameters.get("apiUsername") if not apiUsername: diff --git a/modules/workflows/methods/methodJira/actions/createCsvContent.py b/modules/workflows/methods/methodJira/actions/createCsvContent.py index c856760e..cbec7960 100644 --- a/modules/workflows/methods/methodJira/actions/createCsvContent.py +++ b/modules/workflows/methods/methodJira/actions/createCsvContent.py @@ -1,11 +1,6 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Create CSV Content action for JIRA operations. -Creates CSV content with custom headers. -""" - import logging import json import base64 @@ -14,25 +9,11 @@ import csv as csv_module from io import StringIO from datetime import datetime, UTC from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def createCsvContent(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Create CSV content with custom headers. - - Parameters: - - data (str, required): Document reference containing data as JSON (with "data" field from mergeTicketData) - - headers (str, optional): Document reference containing headers JSON (from parseCsvContent/parseExcelContent) - - columns (str or list, optional): List of column names (if not provided, extracted from taskSyncDefinition or data) - - taskSyncDefinition (str or dict, optional): Field mapping definition (used to extract column names if columns not provided) - - Returns: - - ActionResult with ActionDocument containing CSV content as bytes - """ try: dataParam = parameters.get("data") if not dataParam: diff --git a/modules/workflows/methods/methodJira/actions/createExcelContent.py b/modules/workflows/methods/methodJira/actions/createExcelContent.py index fbf54299..631795b3 100644 --- a/modules/workflows/methods/methodJira/actions/createExcelContent.py +++ b/modules/workflows/methods/methodJira/actions/createExcelContent.py @@ -1,11 +1,6 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Create Excel Content action for JIRA operations. -Creates Excel content with custom headers. -""" - import logging import json import base64 @@ -14,25 +9,11 @@ import csv as csv_module from io import BytesIO from datetime import datetime, UTC from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def createExcelContent(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Create Excel content with custom headers. 
- - Parameters: - - data (str, required): Document reference containing data as JSON (with "data" field from mergeTicketData) - - headers (str, optional): Document reference containing headers JSON (from parseExcelContent) - - columns (str or list, optional): List of column names (if not provided, extracted from taskSyncDefinition or data) - - taskSyncDefinition (str or dict, optional): Field mapping definition (used to extract column names if columns not provided) - - Returns: - - ActionResult with ActionDocument containing Excel content as bytes - """ try: dataParam = parameters.get("data") if not dataParam: diff --git a/modules/workflows/methods/methodJira/actions/exportTicketsAsJson.py b/modules/workflows/methods/methodJira/actions/exportTicketsAsJson.py index 85926851..55d99654 100644 --- a/modules/workflows/methods/methodJira/actions/exportTicketsAsJson.py +++ b/modules/workflows/methods/methodJira/actions/exportTicketsAsJson.py @@ -1,31 +1,14 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Export Tickets As JSON action for JIRA operations. -Exports tickets from JIRA as JSON list. -""" - import logging import json from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def exportTicketsAsJson(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Export tickets from JIRA as JSON list. - - Parameters: - - connectionId (str, required): Connection ID from connectJira action result - - taskSyncDefinition (str or dict, optional): Field mapping definition (if not provided, uses stored definition) - - Returns: - - ActionResult with ActionDocument containing list of tickets as JSON - """ try: connectionIdParam = parameters.get("connectionId") if not connectionIdParam: diff --git a/modules/workflows/methods/methodJira/actions/importTicketsFromJson.py b/modules/workflows/methods/methodJira/actions/importTicketsFromJson.py index b17519ea..b997889e 100644 --- a/modules/workflows/methods/methodJira/actions/importTicketsFromJson.py +++ b/modules/workflows/methods/methodJira/actions/importTicketsFromJson.py @@ -1,32 +1,14 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Import Tickets From JSON action for JIRA operations. -Imports ticket data from JSON back to JIRA. -""" - import logging import json from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def importTicketsFromJson(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Import ticket data from JSON back to JIRA. 
- - Parameters: - - connectionId (str, required): Connection ID from connectJira action result - - ticketData (str, required): Document reference containing ticket data as JSON - - taskSyncDefinition (str or dict, optional): Field mapping definition (if not provided, uses stored definition) - - Returns: - - ActionResult with ActionDocument containing import result with counts - """ try: connectionIdParam = parameters.get("connectionId") if not connectionIdParam: diff --git a/modules/workflows/methods/methodJira/actions/mergeTicketData.py b/modules/workflows/methods/methodJira/actions/mergeTicketData.py index a8f8b486..2bd7ab74 100644 --- a/modules/workflows/methods/methodJira/actions/mergeTicketData.py +++ b/modules/workflows/methods/methodJira/actions/mergeTicketData.py @@ -1,33 +1,14 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Merge Ticket Data action for JIRA operations. -Merges JIRA export data with existing SharePoint data. -""" - import logging import json from typing import Dict, Any, List -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def mergeTicketData(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Merge JIRA export data with existing SharePoint data. - - Parameters: - - jiraData (str, required): Document reference containing JIRA ticket data as JSON array - - existingData (str, required): Document reference containing existing SharePoint data as JSON array - - taskSyncDefinition (str or dict, required): Field mapping definition - - idField (str, optional): Field name to use as ID for merging (default: "ID") - - Returns: - - ActionResult with ActionDocument containing merged data and merge details - """ try: jiraDataParam = parameters.get("jiraData") if not jiraDataParam: diff --git a/modules/workflows/methods/methodJira/actions/parseCsvContent.py b/modules/workflows/methods/methodJira/actions/parseCsvContent.py index 3038e566..bbdc2cc7 100644 --- a/modules/workflows/methods/methodJira/actions/parseCsvContent.py +++ b/modules/workflows/methods/methodJira/actions/parseCsvContent.py @@ -1,34 +1,16 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Parse CSV Content action for JIRA operations. -Parses CSV content with custom headers. -""" - import logging import json import io import pandas as pd from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def parseCsvContent(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Parse CSV content with custom headers. 
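# The per-action modules in the hunks above now define bare
# "async def <name>(self, ...)" functions with the @action decorator and
# docstrings removed. Plain functions bind as methods once assigned onto a
# class, so a registration layer can attach them; how this codebase actually
# wires them up is not shown in this diff, making the sketch below an
# assumption:
from modules.workflows.methods.methodJira.actions.connectJira import connectJira

class MethodJiraSketch:
    """Illustrative stand-in for the real method class."""

MethodJiraSketch.connectJira = connectJira  # instance.connectJira(...) now binds self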
- - Parameters: - - csvContent (str, required): Document reference containing CSV file content as bytes - - skipRows (int, optional): Number of header rows to skip (default: 2) - - hasCustomHeaders (bool, optional): Whether CSV has custom header rows (default: true) - - Returns: - - ActionResult with ActionDocument containing parsed data and headers as JSON - """ try: csvContentParam = parameters.get("csvContent") if not csvContentParam: diff --git a/modules/workflows/methods/methodJira/actions/parseExcelContent.py b/modules/workflows/methods/methodJira/actions/parseExcelContent.py index c0d64325..5ac4e548 100644 --- a/modules/workflows/methods/methodJira/actions/parseExcelContent.py +++ b/modules/workflows/methods/methodJira/actions/parseExcelContent.py @@ -1,34 +1,16 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Parse Excel Content action for JIRA operations. -Parses Excel content with custom headers. -""" - import logging import json import pandas as pd from io import BytesIO from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def parseExcelContent(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Parse Excel content with custom headers. - - Parameters: - - excelContent (str, required): Document reference containing Excel file content as bytes - - skipRows (int, optional): Number of header rows to skip (default: 3) - - hasCustomHeaders (bool, optional): Whether Excel has custom header rows (default: true) - - Returns: - - ActionResult with ActionDocument containing parsed data and headers as JSON - """ try: excelContentParam = parameters.get("excelContent") if not excelContentParam: diff --git a/modules/workflows/methods/methodOutlook.py.old b/modules/workflows/methods/methodOutlook.py.old deleted file mode 100644 index 98dfbc41..00000000 --- a/modules/workflows/methods/methodOutlook.py.old +++ /dev/null @@ -1,1904 +0,0 @@ -# Copyright (c) 2025 Patrick Motsch -# All rights reserved. -""" -Microsoft Outlook Email Operations Module -""" - -import base64 -import re -import logging -from typing import Dict, Any, List, Optional -from datetime import datetime, UTC -import json -import requests - -from modules.workflows.methods.methodBase import MethodBase, action -from modules.datamodels.datamodelChat import ActionResult, ActionDocument - -logger = logging.getLogger(__name__) - -class MethodOutlook(MethodBase): - """Outlook method implementation for email operations""" - - def __init__(self, services): - """Initialize the Outlook method""" - super().__init__(services) - self.name = "outlook" - self.description = "Handle Microsoft Outlook email operations" - - def _format_timestamp_for_filename(self) -> str: - """Format current timestamp as YYYYMMDD-hhmmss for filenames.""" - return datetime.now(UTC).strftime("%Y%m%d-%H%M%S") - - def _getMicrosoftConnection(self, connectionReference: str) -> Optional[Dict[str, Any]]: - """ - Helper function to get Microsoft connection details. 
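# Sketch of what downstream callers do with the connection dict returned by
# _getMicrosoftConnection: the Authorization/Content-Type header pattern below
# is repeated verbatim throughout this file; the helper wrapping it is an
# illustration, not code from the diff.
import requests
from typing import Any, Dict, Optional

def graphGetSketch(connection: Dict[str, Any], path: str,
                   params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    headers = {
        "Authorization": f"Bearer {connection['accessToken']}",
        "Content-Type": "application/json",
    }
    response = requests.get(f"https://graph.microsoft.com/v1.0{path}",
                            headers=headers, params=params)
    response.raise_for_status()
    return response.json()

# e.g. graphGetSketch(connection, "/me/mailFolders") lists mail folders, the
# same call _getFolderId issues below.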
- """ - try: - logger.debug(f"Getting Microsoft connection for reference: {connectionReference}") - - # Get the connection from the service - userConnection = self.services.chat.getUserConnectionFromConnectionReference(connectionReference) - if not userConnection: - logger.error(f"Connection not found: {connectionReference}") - return None - - logger.debug(f"Found connection: {userConnection.id}, status: {userConnection.status.value}, authority: {userConnection.authority.value}") - - # Get a fresh token for this connection - token = self.services.chat.getFreshConnectionToken(userConnection.id) - if not token: - logger.error(f"Fresh token not found for connection: {userConnection.id}") - logger.debug(f"Connection details: {userConnection}") - return None - - logger.debug(f"Fresh token retrieved for connection {userConnection.id}") - - # Check if connection is active - if userConnection.status.value != "active": - logger.error(f"Connection is not active: {userConnection.id}, status: {userConnection.status.value}") - return None - - return { - "id": userConnection.id, - "accessToken": token.tokenAccess, - "refreshToken": token.tokenRefresh, - "scopes": ["Mail.ReadWrite", "Mail.Send", "Mail.ReadWrite.Shared", "User.Read"] # Valid Microsoft Graph API scopes - } - except Exception as e: - logger.error(f"Error getting Microsoft connection: {str(e)}") - return None - - async def _checkPermissions(self, connection: Dict[str, Any]) -> bool: - """ - Check if the current connection has the necessary permissions for Outlook operations. - """ - try: - graph_url = "https://graph.microsoft.com/v1.0" - headers = { - "Authorization": f"Bearer {connection['accessToken']}", - "Content-Type": "application/json" - } - - # Test permissions by trying to access the user's mail folder - test_url = f"{graph_url}/me/mailFolders" - response = requests.get(test_url, headers=headers) - - if response.status_code == 200: - - return True - elif response.status_code == 403: - logger.error("Permission denied - connection lacks necessary mail permissions") - logger.error("Required scopes: Mail.ReadWrite, Mail.Send, Mail.ReadWrite.Shared") - return False - else: - logger.warning(f"Permission check returned status {response.status_code}") - return False - - except Exception as e: - logger.error(f"Error checking permissions: {str(e)}") - return False - - def _sanitizeSearchQuery(self, query: str) -> str: - """ - Sanitize and validate search query for Microsoft Graph API - - Microsoft Graph API has specific requirements for search queries: - - Escape special characters properly - - Handle search operators correctly - - Ensure query format is valid - """ - if not query: - return "" - - # Clean the query - clean_query = query.strip() - - # Handle folder specifications first - if clean_query.lower().startswith('folder:'): - folder_name = clean_query[7:].strip() - if folder_name: - # Return the folder specification as-is - return clean_query - - # Remove any double quotes that might cause issues - clean_query = clean_query.replace('"', '') - - # Handle common search operators - # Recognize Graph operators including both singular and plural forms for hasAttachments - lowered = clean_query.lower() - if any(op in lowered for op in ['from:', 'to:', 'subject:', 'received:', 'hasattachment:', 'hasattachments:']): - # This is an advanced search query, return as-is - return clean_query - - # For basic text search, ensure it's safe for contains() filter - # Remove any characters that might break the OData filter syntax - # Remove or escape 
characters that could break OData filter syntax - safe_query = re.sub(r'[\\\'"]', '', clean_query) - - return safe_query - - def _buildSearchParameters(self, query: str, folder: str, limit: int) -> Dict[str, Any]: - """ - Build search parameters for Microsoft Graph API - - This method handles the complexity of building search parameters - while avoiding conflicts between $search and $filter parameters. - """ - params = { - "$top": limit - } - - if not query or not query.strip(): - # No query specified, just get emails from folder - if folder and folder.lower() != "all": - # Use folder name directly for well-known folders, or get folder ID - if folder.lower() in ["inbox", "drafts", "sentitems", "deleteditems"]: - params["$filter"] = f"parentFolderId eq '{folder}'" - else: - # For custom folders, we need to get the folder ID first - # This will be handled by the calling method - params["$filter"] = f"parentFolderId eq '{folder}'" - # Add orderby for basic queries - params["$orderby"] = "receivedDateTime desc" - return params - - clean_query = self._sanitizeSearchQuery(query) - - # Check if this is a folder specification (e.g., "folder:Drafts", "folder:Inbox") - if clean_query.lower().startswith('folder:'): - folder_name = clean_query[7:].strip() # Remove "folder:" prefix - if folder_name: - # This is a folder specification, not a text search - # Just filter by folder and return - params["$filter"] = f"parentFolderId eq '{folder_name}'" - params["$orderby"] = "receivedDateTime desc" - return params - - # Check if this is a complex search query with multiple operators - # Recognize Graph operators including both singular and plural forms for hasAttachments - lowered = clean_query.lower() - if any(op in lowered for op in ['from:', 'to:', 'subject:', 'received:', 'hasattachment:', 'hasattachments:']): - # This is an advanced search query, use $search - # Microsoft Graph API supports complex search syntax - params["$search"] = f'"{clean_query}"' - - # Note: When using $search, we cannot combine it with $orderby or $filter for folder - # We'll need to filter results after the API call - # Folder filtering will be done after the API call - else: - # Use $filter for basic text search, but keep it simple to avoid "InefficientFilter" error - # Microsoft Graph API has limitations on complex filters - if len(clean_query) > 50: - # If query is too long, truncate it to avoid complex filter issues - clean_query = clean_query[:50] - - - # Use only subject search to keep filter simple - # Handle wildcard queries specially - if clean_query == "*" or clean_query == "": - # For wildcard or empty query, don't use contains filter - # Just use folder filter if specified - if folder and folder.lower() != "all": - params["$filter"] = f"parentFolderId eq '{folder}'" - else: - # No filter needed for wildcard search across all folders - pass - else: - params["$filter"] = f"contains(subject,'{clean_query}')" - - # Add folder filter if specified - if folder and folder.lower() != "all": - params["$filter"] = f"{params['$filter']} and parentFolderId eq '{folder}'" - - # Add orderby for basic queries - params["$orderby"] = "receivedDateTime desc" - - - return params - - def _buildGraphFilter(self, filter_text: str) -> Dict[str, str]: - """ - Build proper Microsoft Graph API filter parameters based on filter text - - Args: - filter_text (str): The filter text to process - - Returns: - Dict[str, str]: Dictionary with either $filter or $search parameter - """ - if not filter_text: - return {} - - filter_text = 
filter_text.strip() - - # Handle folder specifications (e.g., "folder:Drafts", "folder:Inbox") - if filter_text.lower().startswith('folder:'): - folder_name = filter_text[7:].strip() # Remove "folder:" prefix - if folder_name: - # This is a folder specification, return empty to let the main method handle it - return {} - - # Handle search queries (from:, to:, subject:, etc.) - check this FIRST - # Support both singular and plural forms for hasAttachments - lt = filter_text.lower() - if any(lt.startswith(prefix) for prefix in ['from:', 'to:', 'subject:', 'received:', 'hasattachment:', 'hasattachments:']): - return {"$search": f'"{filter_text}"'} - - # Handle email address filters (only if it's NOT a search query) - if '@' in filter_text and '.' in filter_text and ' ' not in filter_text and not filter_text.startswith('from:'): - return {"$filter": f"from/fromAddress/address eq '{filter_text}'"} - - # Handle OData filter conditions (contains 'eq', 'ne', 'gt', 'lt', etc.) - if any(op in filter_text.lower() for op in [' eq ', ' ne ', ' gt ', ' lt ', ' ge ', ' le ', ' and ', ' or ']): - return {"$filter": filter_text} - - # Handle text content - search in subject - return {"$filter": f"contains(subject,'{filter_text}')"} - - def _getFolderId(self, folder_name: str, connection: Dict[str, Any]) -> Optional[str]: - """ - Get the folder ID for a given folder name - - This is needed for proper filtering when using advanced search queries - """ - try: - graph_url = "https://graph.microsoft.com/v1.0" - headers = { - "Authorization": f"Bearer {connection['accessToken']}", - "Content-Type": "application/json" - } - - # Get mail folders - api_url = f"{graph_url}/me/mailFolders" - response = requests.get(api_url, headers=headers) - - if response.status_code == 200: - folders_data = response.json() - all_folders = folders_data.get("value", []) - - - - # Try exact match first - for folder in all_folders: - if folder.get("displayName", "").lower() == folder_name.lower(): - - return folder.get("id") - - # Try common variations for Drafts folder - if folder_name.lower() == "drafts": - draft_variations = ["drafts", "draft", "entwürfe", "entwurf", "brouillons", "brouillon"] - for folder in all_folders: - folder_display_name = folder.get("displayName", "").lower() - if any(variation in folder_display_name for variation in draft_variations): - - return folder.get("id") - - # Try common variations for other folders - if folder_name.lower() == "sent items": - sent_variations = ["sent items", "sent", "gesendete elemente", "éléments envoyés"] - for folder in all_folders: - folder_display_name = folder.get("displayName", "").lower() - if any(variation in folder_display_name for variation in sent_variations): - - return folder.get("id") - - logger.warning(f"Folder '{folder_name}' not found. Available folders: {[f.get('displayName', 'Unknown') for f in all_folders]}") - return None - else: - logger.warning(f"Could not retrieve folders: {response.status_code}") - return None - - except Exception as e: - logger.warning(f"Error getting folder ID for '{folder_name}': {str(e)}") - return None - - @action - async def readEmails(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Read emails and metadata from a mailbox folder. - - Input requirements: connectionReference (required); optional folder, limit, filter, outputMimeType. - - Output format: JSON with emails and metadata. - - Parameters: - - connectionReference (str, required): Microsoft connection label. 
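# Routing behaviour of _buildGraphFilter above, traced branch by branch with
# illustrative inputs:
#   "folder:Drafts"           -> {}   (caller resolves the folder itself)
#   "from:alice@example.com"  -> {"$search": '"from:alice@example.com"'}
#   "alice@example.com"       -> {"$filter": "from/fromAddress/address eq 'alice@example.com'"}
#   "isRead eq false"         -> {"$filter": "isRead eq false"}
#   "quarterly report"        -> {"$filter": "contains(subject,'quarterly report')"}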
- - folder (str, optional): Folder to read from. Default: Inbox. - - limit (int, optional): Maximum items to return. Must be > 0. Default: 1000. - - filter (str, optional): Sender, query operators, or subject text. - - outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json". - """ - import time - operationId = None - try: - # Init progress logger - workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}" - operationId = f"outlook_read_{workflowId}_{int(time.time())}" - - # Start progress tracking - parentOperationId = parameters.get('parentOperationId') - self.services.chat.progressLogStart( - operationId, - "Read Emails", - "Outlook Email Reading", - f"Folder: {parameters.get('folder', 'Inbox')}", - parentOperationId=parentOperationId - ) - - connectionReference = parameters.get("connectionReference") - folder = parameters.get("folder", "Inbox") - limit = parameters.get("limit", 10) - filter = parameters.get("filter") - outputMimeType = parameters.get("outputMimeType", "application/json") - - if not connectionReference: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="Connection reference is required") - - self.services.chat.progressLogUpdate(operationId, 0.2, "Validating parameters") - - # Validate limit parameter - if limit <= 0: - limit = 1000 - logger.warning(f"Invalid limit value ({limit}), using default value 1000") - - # Validate filter parameter if provided - if filter: - # Remove any potentially dangerous characters that could break the filter - filter = filter.strip() - if len(filter) > 100: - logger.warning(f"Filter too long ({len(filter)} chars), truncating to 100 characters") - filter = filter[:100] - - - # Get Microsoft connection - self.services.chat.progressLogUpdate(operationId, 0.3, "Getting Microsoft connection") - connection = self._getMicrosoftConnection(connectionReference) - if not connection: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="No valid Microsoft connection found for the provided connection reference") - - # Read emails using Microsoft Graph API - self.services.chat.progressLogUpdate(operationId, 0.4, "Reading emails from Microsoft Graph API") - try: - # Microsoft Graph API endpoint for messages - graph_url = "https://graph.microsoft.com/v1.0" - headers = { - "Authorization": f"Bearer {connection['accessToken']}", - "Content-Type": "application/json" - } - - # Get the folder ID for the specified folder - folder_id = self._getFolderId(folder, connection) - - if folder_id: - # Build the API request with folder ID - api_url = f"{graph_url}/me/mailFolders/{folder_id}/messages" - else: - # Fallback: use folder name directly (for well-known folders like "Inbox") - api_url = f"{graph_url}/me/mailFolders/{folder}/messages" - logger.warning(f"Could not find folder ID for '{folder}', using folder name directly") - params = { - "$top": limit, - "$orderby": "receivedDateTime desc" - } - - if filter: - # Build proper Graph API filter parameters - filter_params = self._buildGraphFilter(filter) - params.update(filter_params) - - # If using $search, remove $orderby as they can't be combined - if "$search" in params: - params.pop("$orderby", None) - - # If using $filter with contains(), remove $orderby as they can't be combined - # Microsoft Graph API doesn't support contains() with orderby - if "$filter" in params 
and "contains(" in params["$filter"].lower(): - params.pop("$orderby", None) - - # Filter applied - - # Make the API call - - - response = requests.get(api_url, headers=headers, params=params) - - if response.status_code != 200: - logger.error(f"Graph API error: {response.status_code} - {response.text}") - logger.error(f"Request URL: {response.url}") - logger.error(f"Request headers: {headers}") - logger.error(f"Request params: {params}") - - response.raise_for_status() - - self.services.chat.progressLogUpdate(operationId, 0.7, "Processing email data") - emails_data = response.json() - email_data = { - "emails": emails_data.get("value", []), - "count": len(emails_data.get("value", [])), - "folder": folder, - "filter": filter, - "apiMetadata": { - "@odata.context": emails_data.get("@odata.context"), - "@odata.count": emails_data.get("@odata.count"), - "@odata.nextLink": emails_data.get("@odata.nextLink") - } - } - - - - except ImportError: - logger.error("requests module not available") - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="requests module not available") - except requests.exceptions.HTTPError as e: - if e.response.status_code == 400: - logger.error(f"Bad Request (400) - Invalid filter or parameter: {e.response.text}") - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error=f"Invalid filter syntax. Please check your filter parameter. Error: {e.response.text}") - elif e.response.status_code == 401: - logger.error("Unauthorized (401) - Access token may be expired or invalid") - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="Authentication failed. Please check your connection and try again.") - elif e.response.status_code == 403: - logger.error("Forbidden (403) - Insufficient permissions to access emails") - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="Insufficient permissions to read emails from this folder.") - else: - logger.error(f"HTTP Error {e.response.status_code}: {e.response.text}") - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error=f"HTTP Error {e.response.status_code}: {e.response.text}") - except Exception as e: - logger.error(f"Error reading emails from Microsoft Graph API: {str(e)}") - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error=f"Failed to read emails: {str(e)}") - - # Determine output format based on MIME type - mime_type_mapping = { - "application/json": ".json", - "text/plain": ".txt", - "text/csv": ".csv" - } - output_extension = mime_type_mapping.get(outputMimeType, ".json") - output_mime_type = outputMimeType - logger.info(f"Using output format: {output_extension} ({output_mime_type})") - - - - # Create result data as JSON string - result_data = { - "connectionReference": connectionReference, - "folder": folder, - "limit": limit, - "filter": filter, - "emails": email_data, - "connection": { - "id": connection["id"], - "authority": "microsoft", - "reference": connectionReference - }, - "timestamp": self.services.utils.timestampGetUtc() - } - - validationMetadata = { - "actionType": "outlook.readEmails", - "connectionReference": connectionReference, - "folder": folder, - "limit": limit, - "filter": filter, - "emailCount": email_data.get("count", 0), - "outputMimeType": 
outputMimeType - } - - self.services.chat.progressLogUpdate(operationId, 0.9, f"Found {email_data.get('count', 0)} emails") - self.services.chat.progressLogFinish(operationId, True) - - return ActionResult.isSuccess( - documents=[ActionDocument( - documentName=f"outlook_emails_{self._format_timestamp_for_filename()}.json", - documentData=json.dumps(result_data, indent=2), - mimeType="application/json", - validationMetadata=validationMetadata - )] - ) - - except Exception as e: - logger.error(f"Error reading emails: {str(e)}") - if operationId: - try: - self.services.chat.progressLogFinish(operationId, False) - except: - pass # Don't fail on progress logging errors - return ActionResult.isFailure( - error=str(e) - ) - - @action - async def searchEmails(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Search emails by query and return matching items with metadata. - - Input requirements: connectionReference (required); query (required); optional folder, limit, outputMimeType. - - Output format: JSON with search results and metadata. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - query (str, required): Search expression. - - folder (str, optional): Folder scope or All. Default: All. - - limit (int, optional): Maximum items to return. Must be > 0. Default: 1000. - - outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json". - """ - try: - connectionReference = parameters.get("connectionReference") - query = parameters.get("query") - folder = parameters.get("folder", "All") - limit = parameters.get("limit", 1000) - outputMimeType = parameters.get("outputMimeType", "application/json") - - # Validate parameters - if not connectionReference: - return ActionResult.isFailure(error="Connection reference is required") - - # Validate limit parameter - if limit <= 0: - limit = 1000 - logger.warning(f"Invalid limit value ({limit}), using default value 1000") - - if not query or not query.strip(): - return ActionResult.isFailure(error="Search query is required and cannot be empty") - - # Check if this is a folder specification query - if query.strip().lower().startswith('folder:'): - folder_name = query.strip()[7:].strip() # Remove "folder:" prefix - if not folder_name: - return ActionResult.isFailure(error="Invalid folder specification. 
Use format 'folder:FolderName'") - logger.info(f"Search query is a folder specification: {folder_name}") - - # Validate limit - try: - limit = int(limit) - if limit <= 0: - limit = 1000 - logger.warning("Invalid limit value (<=0), using default value 1000") - elif limit > 1000: # Microsoft Graph API has limits - logger.warning(f"Limit {limit} exceeds maximum (1000), using 1000") - limit = 1000 - except (ValueError, TypeError): - limit = 1000 - logger.warning("Invalid limit value, using default value 1000") - - # Get Microsoft connection - connection = self._getMicrosoftConnection(connectionReference) - if not connection: - return ActionResult.isFailure(error="No valid Microsoft connection found for the provided connection reference") - - # Search emails using Microsoft Graph API - try: - # Microsoft Graph API endpoint for searching messages - graph_url = "https://graph.microsoft.com/v1.0" - headers = { - "Authorization": f"Bearer {connection['accessToken']}", - "Content-Type": "application/json" - } - - # Get the folder ID for the specified folder if needed - folder_id = None - if folder and folder.lower() != "all": - folder_id = self._getFolderId(folder, connection) - if folder_id: - logger.debug(f"Found folder ID for '{folder}': {folder_id}") - else: - logger.warning(f"Could not find folder ID for '{folder}', using folder name directly") - - # Build the search API request - api_url = f"{graph_url}/me/messages" - params = self._buildSearchParameters(query, folder_id or folder, limit) - - # Log search parameters for debugging - logger.debug(f"Search query: '{query}'") - logger.debug(f"Search folder: '{folder}'") - logger.debug(f"Search parameters: {params}") - logger.debug(f"API URL: {api_url}") - - # Make the API call - response = requests.get(api_url, headers=headers, params=params) - - if response.status_code != 200: - # Log detailed error information - try: - error_data = response.json() - logger.error(f"Microsoft Graph API error: {response.status_code} - {error_data}") - except Exception: - logger.error(f"Microsoft Graph API error: {response.status_code} - {response.text}") - - # Check for specific error types and provide helpful messages - if response.status_code == 400: - logger.error("Bad Request (400) - Check search query format and parameters") - logger.error(f"Search query: '{query}'") - logger.error(f"Search parameters: {params}") - logger.error(f"API URL: {api_url}") - elif response.status_code == 401: - logger.error("Unauthorized (401) - Check access token and permissions") - elif response.status_code == 403: - logger.error("Forbidden (403) - Check API permissions and scopes") - elif response.status_code == 429: - logger.error("Too Many Requests (429) - Rate limit exceeded") - - raise Exception(f"Microsoft Graph API returned {response.status_code}: {response.text}") - - search_data = response.json() - emails = search_data.get("value", []) - - - - # Apply folder filtering if needed and we used $search - if folder and folder.lower() != "all" and "$search" in params: - # Get the actual folder ID for proper filtering - folder_id = self._getFolderId(folder, connection) - - if folder_id: - # Filter results by folder ID - filtered_emails = [] - for email in emails: - if email.get("parentFolderId") == folder_id: - filtered_emails.append(email) - emails = filtered_emails - logger.debug(f"Applied folder filtering: {len(filtered_emails)} emails found in folder {folder}") - else: - # Fallback: try to filter by folder name 
(less reliable) - filtered_emails = [] - for email in emails: - # Check if email has folder information (messages are dicts, so use .get()) - if email.get('parentFolderId'): - if email.get('parentFolderId') == folder: - filtered_emails.append(email) - else: - # If no folder info, include the email (less strict filtering) - filtered_emails.append(email) - - emails = filtered_emails - logger.debug(f"Applied fallback folder filtering: {len(filtered_emails)} emails found in folder {folder}") - - # Special handling for folder specification queries - if query.strip().lower().startswith('folder:'): - folder_name = query.strip()[7:].strip() - folder_id = self._getFolderId(folder_name, connection) - if folder_id: - # Filter results to only include emails from the specified folder - filtered_emails = [] - for email in emails: - if email.get("parentFolderId") == folder_id: - filtered_emails.append(email) - emails = filtered_emails - logger.debug(f"Applied folder specification filtering: {len(filtered_emails)} emails found in folder {folder_name}") - else: - logger.warning(f"Could not find folder ID for folder specification: {folder_name}") - - - search_result = { - "query": query, - "results": emails, - "count": len(emails), - "folder": folder, - "limit": limit, - "apiMetadata": { - "@odata.context": search_data.get("@odata.context"), - "@odata.count": search_data.get("@odata.count"), - "@odata.nextLink": search_data.get("@odata.nextLink") - }, - "searchParams": params - } - - - - except ImportError: - logger.error("requests module not available") - return ActionResult.isFailure(error="requests module not available") - except Exception as e: - logger.error(f"Error searching emails via Microsoft Graph API: {str(e)}") - return ActionResult.isFailure(error=f"Failed to search emails: {str(e)}") - - # Determine output format based on MIME type - mime_type_mapping = { - "application/json": ".json", - "text/plain": ".txt", - "text/csv": ".csv" - } - output_extension = mime_type_mapping.get(outputMimeType, ".json") - output_mime_type = outputMimeType - logger.info(f"Using output format: {output_extension} ({output_mime_type})") - - - - result_data = { - "connectionReference": connectionReference, - "query": query, - "folder": folder, - "limit": limit, - "searchResults": search_result, - "connection": { - "id": connection["id"], - "authority": "microsoft", - "reference": connectionReference - }, - "timestamp": self.services.utils.timestampGetUtc() - } - - validationMetadata = { - "actionType": "outlook.searchEmails", - "connectionReference": connectionReference, - "query": query, - "folder": folder, - "limit": limit, - "resultCount": search_result.get("count", 0), - "outputMimeType": outputMimeType - } - - return ActionResult( - success=True, - documents=[ActionDocument( - documentName=f"outlook_email_search_{self._format_timestamp_for_filename()}.json", - documentData=json.dumps(result_data, indent=2), - mimeType="application/json", - validationMetadata=validationMetadata - )] - ) - - except Exception as e: - logger.error(f"Error searching emails: {str(e)}") - return ActionResult.isFailure(error=str(e)) - - async def listDrafts(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: List draft emails from a folder. - - Input requirements: connectionReference (required); optional folder, limit, outputMimeType. - - Output format: JSON with draft items and metadata. - - Parameters: - - connectionReference (str, required): Microsoft connection label. 
- - folder (str, optional): Drafts folder to list. Default: Drafts. - - limit (int, optional): Maximum items to return. Must be > 0. Default: 1000. - - outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json". - """ - try: - connectionReference = parameters.get("connectionReference") - folder = parameters.get("folder", "Drafts") - limit = parameters.get("limit", 1000) - outputMimeType = parameters.get("outputMimeType", "application/json") - - if not connectionReference: - return ActionResult.isFailure(error="Connection reference is required") - - # Get Microsoft connection - connection = self._getMicrosoftConnection(connectionReference) - if not connection: - return ActionResult.isFailure(error="No valid Microsoft connection found for the provided connection reference") - - # List drafts using Microsoft Graph API - try: - # Microsoft Graph API endpoint for messages - graph_url = "https://graph.microsoft.com/v1.0" - headers = { - "Authorization": f"Bearer {connection['accessToken']}", - "Content-Type": "application/json" - } - - # Get the folder ID for the specified folder - folder_id = self._getFolderId(folder, connection) - - if folder_id: - # List messages in the specific folder - api_url = f"{graph_url}/me/mailFolders/{folder_id}/messages" - - else: - # Fallback: list all messages (might include drafts) - api_url = f"{graph_url}/me/messages" - logger.warning(f"Could not find folder '{folder}', listing all messages") - - params = { - "$top": limit, - "$orderby": "lastModifiedDateTime desc", - "$select": "id,subject,from,toRecipients,ccRecipients,bccRecipients,receivedDateTime,lastModifiedDateTime,parentFolderId,isDraft" - } - - # Make the API call - response = requests.get(api_url, headers=headers, params=params) - response.raise_for_status() - - messages_data = response.json() - messages = messages_data.get("value", []) - - # Filter for drafts if we're looking at all messages - if not folder_id: - drafts = [msg for msg in messages if msg.get("isDraft", False)] - messages = drafts - - - drafts_result = { - "folder": folder, - "folderId": folder_id, - "drafts": messages, - "count": len(messages), - "limit": limit, - "apiResponse": messages_data - } - - - - except ImportError: - logger.error("requests module not available") - return ActionResult.isFailure(error="requests module not available") - except Exception as e: - logger.error(f"Error listing drafts via Microsoft Graph API: {str(e)}") - return ActionResult.isFailure(error=f"Failed to list drafts: {str(e)}") - - # Determine output format based on MIME type - mime_type_mapping = { - "application/json": ".json", - "text/plain": ".txt", - "text/csv": ".csv" - } - output_extension = mime_type_mapping.get(outputMimeType, ".json") - output_mime_type = outputMimeType - logger.info(f"Using output format: {output_extension} ({output_mime_type})") - - - - result_data = { - "connectionReference": connectionReference, - "folder": folder, - "limit": limit, - "draftsResult": drafts_result, - "connection": { - "id": connection["id"], - "authority": "microsoft", - "reference": connectionReference - }, - "timestamp": self.services.utils.timestampGetUtc() - } - - validationMetadata = { - "actionType": "outlook.listDrafts", - "connectionReference": connectionReference, - "folder": folder, - "limit": limit, - "draftCount": drafts_result.get("count", 0), - "outputMimeType": outputMimeType - } - - return ActionResult( - success=True, - documents=[ActionDocument( 
- documentName=f"outlook_drafts_list_{self._format_timestamp_for_filename()}.json", - documentData=json.dumps(result_data, indent=2), - mimeType="application/json", - validationMetadata=validationMetadata - )] - ) - - except Exception as e: - logger.error(f"Error listing drafts: {str(e)}") - return ActionResult.isFailure(error=str(e)) - - async def findDrafts(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Find draft emails across folders. - - Input requirements: connectionReference (required); optional limit, outputMimeType. - - Output format: JSON with drafts and metadata. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - limit (int, optional): Maximum items to return. Default: 50. - - outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json". - """ - try: - connectionReference = parameters.get("connectionReference") - limit = parameters.get("limit", 50) - outputMimeType = parameters.get("outputMimeType", "application/json") - - if not connectionReference: - return ActionResult.isFailure(error="Connection reference is required") - - # Get Microsoft connection - connection = self._getMicrosoftConnection(connectionReference) - if not connection: - return ActionResult.isFailure(error="No valid Microsoft connection found for the provided connection reference") - - # Find drafts using Microsoft Graph API - try: - # Microsoft Graph API endpoint for messages - graph_url = "https://graph.microsoft.com/v1.0" - headers = { - "Authorization": f"Bearer {connection['accessToken']}", - "Content-Type": "application/json" - } - - # Get all messages and filter for drafts - api_url = f"{graph_url}/me/messages" - params = { - "$top": limit, - "$select": "id,subject,from,toRecipients,ccRecipients,bccRecipients,receivedDateTime,lastModifiedDateTime,parentFolderId,isDraft,webLink", - "$filter": "isDraft eq true" - } - - - - # Make the API call - response = requests.get(api_url, headers=headers, params=params) - response.raise_for_status() - - messages_data = response.json() - drafts = messages_data.get("value", []) - - # Get folder information for each draft - for draft in drafts: - if "parentFolderId" in draft: - folder_info = self._getFolderNameById(draft["parentFolderId"], connection) - draft["folderName"] = folder_info - - drafts_result = { - "totalDrafts": len(drafts), - "drafts": drafts, - "limit": limit, - "apiResponse": messages_data - } - - - - except ImportError: - logger.error("requests module not available") - return ActionResult.isFailure(error="requests module not available") - except Exception as e: - logger.error(f"Error finding drafts via Microsoft Graph API: {str(e)}") - return ActionResult.isFailure(error=f"Failed to find drafts: {str(e)}") - - # Determine output format based on MIME type - mime_type_mapping = { - "application/json": ".json", - "text/plain": ".txt", - "text/csv": ".csv" - } - output_extension = mime_type_mapping.get(outputMimeType, ".json") - output_mime_type = outputMimeType - logger.info(f"Using output format: {output_extension} ({output_mime_type})") - - - - result_data = { - "connectionReference": connectionReference, - "limit": limit, - "draftsResult": drafts_result, - "connection": { - "id": connection["id"], - "authority": "microsoft", - "reference": connectionReference - }, - "timestamp": self.services.utils.timestampGetUtc() - } - - validationMetadata = { - "actionType": "outlook.findDrafts", - 
"connectionReference": connectionReference, - "limit": limit, - "totalDrafts": drafts_result.get("totalDrafts", 0), - "outputMimeType": outputMimeType - } - - return ActionResult( - success=True, - documents=[ActionDocument( - documentName=f"outlook_drafts_found_{self._format_timestamp_for_filename()}.json", - documentData=json.dumps(result_data, indent=2), - mimeType="application/json", - validationMetadata=validationMetadata - )] - ) - - except Exception as e: - logger.error(f"Error finding drafts: {str(e)}") - return ActionResult.isFailure(error=str(e)) - - def _getFolderNameById(self, folder_id: str, connection: Dict[str, Any]) -> str: - """ - Get folder name by folder ID - - This is a helper method to identify which folder a draft is in - """ - try: - graph_url = "https://graph.microsoft.com/v1.0" - headers = { - "Authorization": f"Bearer {connection['accessToken']}", - "Content-Type": "application/json" - } - - # Get folder information - api_url = f"{graph_url}/me/mailFolders/{folder_id}" - response = requests.get(api_url, headers=headers) - - if response.status_code == 200: - folder_data = response.json() - return folder_data.get("displayName", f"Unknown Folder ({folder_id})") - else: - return f"Unknown Folder ({folder_id})" - - except Exception as e: - logger.warning(f"Error getting folder name for ID '{folder_id}': {str(e)}") - return f"Unknown Folder ({folder_id})" - - async def checkDraftsFolder(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Check contents of the Drafts folder. - - Input requirements: connectionReference (required); optional limit, outputMimeType. - - Output format: JSON with drafts and metadata. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - limit (int, optional): Maximum items to return. Default: 20. - - outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json". 
- """ - try: - connectionReference = parameters.get("connectionReference") - limit = parameters.get("limit", 20) - outputMimeType = parameters.get("outputMimeType", "application/json") - - if not connectionReference: - return ActionResult.isFailure(error="Connection reference is required") - - # Get Microsoft connection - connection = self._getMicrosoftConnection(connectionReference) - if not connection: - return ActionResult.isFailure(error="No valid Microsoft connection found for the provided connection reference") - - # Check Drafts folder directly - try: - # Microsoft Graph API endpoint for messages - graph_url = "https://graph.microsoft.com/v1.0" - headers = { - "Authorization": f"Bearer {connection['accessToken']}", - "Content-Type": "application/json" - } - - # Get the Drafts folder ID - drafts_folder_id = self._getFolderId("Drafts", connection) - - if not drafts_folder_id: - return ActionResult.isFailure(error="Could not find Drafts folder") - - # Get messages directly from Drafts folder - api_url = f"{graph_url}/me/mailFolders/{drafts_folder_id}/messages" - params = { - "$top": limit, - "$select": "id,subject,from,toRecipients,ccRecipients,bccRecipients,receivedDateTime,lastModifiedDateTime,isDraft,webLink", - "$orderby": "lastModifiedDateTime desc" - } - - - - # Make the API call - response = requests.get(api_url, headers=headers, params=params) - response.raise_for_status() - - messages_data = response.json() - drafts = messages_data.get("value", []) - - - - drafts_result = { - "draftsFolderId": drafts_folder_id, - "totalDrafts": len(drafts), - "drafts": drafts, - "limit": limit, - "apiResponse": messages_data, - "apiUrl": api_url - } - - - - except ImportError: - logger.error("requests module not available") - return ActionResult.isFailure(error="requests module not available") - except Exception as e: - logger.error(f"Error checking Drafts folder via Microsoft Graph API: {str(e)}") - return ActionResult.isFailure(error=f"Failed to check Drafts folder: {str(e)}") - - # Determine output format based on MIME type - mime_type_mapping = { - "application/json": ".json", - "text/plain": ".txt", - "text/csv": ".csv" - } - output_extension = mime_type_mapping.get(outputMimeType, ".json") - output_mime_type = outputMimeType - logger.info(f"Using output format: {output_extension} ({output_mime_type})") - - - - result_data = { - "connectionReference": connectionReference, - "limit": limit, - "draftsResult": drafts_result, - "connection": { - "id": connection["id"], - "authority": "microsoft", - "reference": connectionReference - }, - "timestamp": self.services.utils.timestampGetUtc() - } - - validationMetadata = { - "actionType": "outlook.checkDraftsFolder", - "connectionReference": connectionReference, - "limit": limit, - "totalDrafts": drafts_result.get("totalDrafts", 0), - "draftsFolderId": drafts_result.get("draftsFolderId"), - "outputMimeType": outputMimeType - } - - return ActionResult( - success=True, - documents=[ActionDocument( - documentName=f"outlook_drafts_folder_check_{self._format_timestamp_for_filename()}.json", - documentData=json.dumps(result_data, indent=2), - mimeType="application/json", - validationMetadata=validationMetadata - )] - ) - - except Exception as e: - logger.error(f"Error checking Drafts folder: {str(e)}") - return ActionResult.isFailure(error=str(e)) - - @action - async def composeAndDraftEmailWithContext(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Compose email content using AI from context and optional documents, then 
create a draft. - - Input requirements: connectionReference (required); to (required); context (required); optional documentList, cc, bcc, emailStyle, maxLength. - - Output format: JSON confirmation with AI-generated draft metadata. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - to (list, required): Recipient email addresses. - - context (str, required): Detailed context for composing the email. - - documentList (list, optional): Document references for context/attachments. - - cc (list, optional): CC recipients. - - bcc (list, optional): BCC recipients. - - emailStyle (str, optional): formal | casual | business. Default: business. - - maxLength (int, optional): Maximum length for generated content. Default: 1000. - """ - try: - connectionReference = parameters.get("connectionReference") - to = parameters.get("to") - context = parameters.get("context") - documentList = parameters.get("documentList", []) - cc = parameters.get("cc", []) - bcc = parameters.get("bcc", []) - emailStyle = parameters.get("emailStyle", "business") - maxLength = parameters.get("maxLength", 1000) - - if not connectionReference or not to or not context: - return ActionResult.isFailure(error="connectionReference, to, and context are required") - - # Convert single values to lists for all recipient parameters - if isinstance(to, str): - to = [to] - if isinstance(cc, str): - cc = [cc] - if isinstance(bcc, str): - bcc = [bcc] - if isinstance(documentList, str): - documentList = [documentList] - - # Get Microsoft connection - connection = self._getMicrosoftConnection(connectionReference) - if not connection: - return ActionResult.isFailure(error="No valid Microsoft connection found") - - # Check permissions - permissions_ok = await self._checkPermissions(connection) - if not permissions_ok: - return ActionResult.isFailure(error="Connection lacks necessary permissions for Outlook operations") - - # Prepare documents for AI processing - from modules.datamodels.datamodelDocref import DocumentReferenceList - chatDocuments = [] - if documentList: - # Convert to DocumentReferenceList if needed - if isinstance(documentList, DocumentReferenceList): - docRefList = documentList - elif isinstance(documentList, list): - docRefList = DocumentReferenceList.from_string_list(documentList) - elif isinstance(documentList, str): - docRefList = DocumentReferenceList.from_string_list([documentList]) - else: - docRefList = DocumentReferenceList(references=[]) - chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(docRefList) - - # Create AI prompt for email composition - # Build document reference list for AI with expanded list contents when possible - doc_references = documentList - doc_list_text = "" - if doc_references: - lines = ["Available_Document_References:"] - for ref in doc_references: - # Each item is a label: resolve to its document list and render contained items - from modules.datamodels.datamodelDocref import DocumentReferenceList - list_docs = self.services.chat.getChatDocumentsFromDocumentList(DocumentReferenceList.from_string_list([ref])) or [] - if list_docs: - for d in list_docs: - doc_ref_label = self.services.chat.getDocumentReferenceFromChatDocument(d) - lines.append(f"- {doc_ref_label}") - else: - lines.append(" - (no documents)") - doc_list_text = "\n" + "\n".join(lines) - else: - doc_list_text = "Available_Document_References: (No documents available for attachment)" - - # Escape only the user-controlled context to prevent prompt injection - escaped_context = 
context.replace('"', '\\"').replace('\n', '\\n').replace('\r', '\\r') - - ai_prompt = f"""Compose an email based on this context: -------- -{escaped_context} -------- - -Recipients: {to} -Style: {emailStyle} -Max length: {maxLength} characters -{doc_list_text} - -Based on the context, decide which documents to attach. - -CRITICAL: Use EXACT document references from Available_Document_References above. For individual documents: ALWAYS use docItem:: format (include filename) - -Return JSON: -{{ - "subject": "subject line", - "body": "email body (HTML allowed)", - "attachments": ["docItem::"] -}} -""" - - # Call AI service to generate email content - try: - ai_response = await self.services.ai.callAiPlanning( - prompt=ai_prompt, - placeholders=None, - debugType="email_composition" - ) - - # Parse AI response - try: - ai_content = ai_response - # Extract JSON from AI response - if "```json" in ai_content: - json_start = ai_content.find("```json") + 7 - json_end = ai_content.find("```", json_start) - json_content = ai_content[json_start:json_end].strip() - elif "{" in ai_content and "}" in ai_content: - json_start = ai_content.find("{") - json_end = ai_content.rfind("}") + 1 - json_content = ai_content[json_start:json_end] - else: - json_content = ai_content - - email_data = json.loads(json_content) - subject = email_data.get("subject", "") - body = email_data.get("body", "") - ai_attachments = email_data.get("attachments", []) - - if not subject or not body: - return ActionResult.isFailure(error="AI did not generate valid subject and body") - - # Use AI-selected attachments if provided, otherwise use all documents - if documentList: - try: - available_refs = [documentList] if isinstance(documentList, str) else documentList - from modules.datamodels.datamodelDocref import DocumentReferenceList - available_docs = self.services.chat.getChatDocumentsFromDocumentList(DocumentReferenceList.from_string_list(available_refs)) or [] - except Exception: - available_docs = [] - - # Normalize AI attachments to a list of strings - if isinstance(ai_attachments, str): - ai_attachments = [ai_attachments] - elif isinstance(ai_attachments, list): - ai_attachments = [a for a in ai_attachments if isinstance(a, str)] - - # Initialize normalized_ai_attachments - normalized_ai_attachments = [] - - if ai_attachments: - try: - ai_refs = [ai_attachments] if isinstance(ai_attachments, str) else ai_attachments - from modules.datamodels.datamodelDocref import DocumentReferenceList - ai_docs = self.services.chat.getChatDocumentsFromDocumentList(DocumentReferenceList.from_string_list(ai_refs)) or [] - except Exception: - ai_docs = [] - - # Intersect by document id - available_ids = {getattr(d, 'id', None) for d in available_docs} - selected_docs = [d for d in ai_docs if getattr(d, 'id', None) in available_ids] - - if selected_docs: - # Map selected ChatDocuments back to docItem references (with full filename) - documentList = [self.services.chat.getDocumentReferenceFromChatDocument(d) for d in selected_docs] - # Normalize ai_attachments to full format for storage - normalized_ai_attachments = documentList.copy() - logger.info(f"AI selected {len(documentList)} documents for attachment (resolved via ChatDocuments)") - else: - # No intersection; use all available documents - documentList = [self.services.chat.getDocumentReferenceFromChatDocument(d) for d in available_docs] - normalized_ai_attachments = documentList.copy() - logger.warning("AI selected attachments not found in available documents, using all documents") - else: - 
# No AI selection; use all available documents - documentList = [self.services.chat.getDocumentReferenceFromChatDocument(d) for d in available_docs] - normalized_ai_attachments = documentList.copy() - logger.warning("AI did not specify attachments, using all available documents") - else: - logger.info("No documents provided in documentList; skipping attachment processing") - - except json.JSONDecodeError as e: - logger.error(f"Failed to parse AI response as JSON: {str(e)}") - logger.error(f"AI response content: {ai_response}") - return ActionResult.isFailure(error="AI response was not valid JSON format") - - except Exception as e: - logger.error(f"Error calling AI service: {str(e)}") - return ActionResult.isFailure(error=f"Failed to generate email content: {str(e)}") - - # Now create the email with AI-generated content - try: - graph_url = "https://graph.microsoft.com/v1.0" - headers = { - "Authorization": f"Bearer {connection['accessToken']}", - "Content-Type": "application/json" - } - - # Clean and format body content - cleaned_body = body.strip() - - # Check if body is already HTML - if cleaned_body.startswith('<html>') or cleaned_body.startswith('<p>') or '<br>' in cleaned_body: - html_body = cleaned_body - else: - # Convert plain text to proper HTML formatting - html_body = cleaned_body.replace('\n', '<br>') - html_body = f"<html><body>{html_body}</body></html>" - 
- # Build the email message - message = { - "subject": subject, - "body": { - "contentType": "HTML", - "content": html_body - }, - "toRecipients": [{"emailAddress": {"address": email}} for email in to], - "ccRecipients": [{"emailAddress": {"address": email}} for email in cc] if cc else [], - "bccRecipients": [{"emailAddress": {"address": email}} for email in bcc] if bcc else [] - } - - # Add documents as attachments if provided - if documentList: - message["attachments"] = [] - for attachment_ref in documentList: - # Get attachment document from service center - from modules.datamodels.datamodelDocref import DocumentReferenceList - attachment_docs = self.services.chat.getChatDocumentsFromDocumentList(DocumentReferenceList.from_string_list([attachment_ref])) - if attachment_docs: - for doc in attachment_docs: - file_id = getattr(doc, 'fileId', None) - if file_id: - try: - file_content = self.services.chat.getFileData(file_id) - if file_content: - if isinstance(file_content, bytes): - content_bytes = file_content - else: - content_bytes = str(file_content).encode('utf-8') - - base64_content = base64.b64encode(content_bytes).decode('utf-8') - - attachment = { - "@odata.type": "#microsoft.graph.fileAttachment", - "name": doc.fileName, - "contentType": doc.mimeType or "application/octet-stream", - "contentBytes": base64_content - } - message["attachments"].append(attachment) - except Exception as e: - logger.error(f"Error reading attachment file {doc.fileName}: {str(e)}") - 
- # Create the draft message - drafts_folder_id = self._getFolderId("Drafts", connection) - - if drafts_folder_id: - api_url = f"{graph_url}/me/mailFolders/{drafts_folder_id}/messages" - else: - api_url = f"{graph_url}/me/messages" - logger.warning("Could not find Drafts folder, creating draft in default location") - - response = requests.post(api_url, headers=headers, json=message) - - if response.status_code in [200, 201]: - draft_data = response.json() - draft_id = draft_data.get("id", "Unknown") - - # Create draft result data with full draft information - draftResultData = { - "status": "draft", - "message": "Email draft created successfully with AI-generated content", - "draftId": draft_id, - "folder": "Drafts (Entwürfe)", - "mailbox": connection.get('userEmail', 'Unknown'), - "subject": subject, - "body": body, - "recipients": to, - "cc": cc, - "bcc": bcc, - "attachments": len(documentList), - "aiSelectedAttachments": normalized_ai_attachments if normalized_ai_attachments else "all documents", - "aiGenerated": True, - "context": context, - "emailStyle": emailStyle, - "timestamp": self.services.utils.timestampGetUtc(), - "draftData": draft_data - } - - # Extract attachment filenames for validation metadata - attachmentFilenames = [] - attachmentReferences = [] - if documentList: - try: - from modules.datamodels.datamodelDocref import DocumentReferenceList - attached_docs = self.services.chat.getChatDocumentsFromDocumentList(DocumentReferenceList.from_string_list(documentList)) or [] - attachmentFilenames = [getattr(doc, 'fileName', '') for doc in attached_docs if getattr(doc, 'fileName', None)] - # Store normalized document references (with filenames) - use normalized_ai_attachments if available - attachmentReferences = normalized_ai_attachments if normalized_ai_attachments else [self.services.chat.getDocumentReferenceFromChatDocument(d) for d in attached_docs] - except Exception: - pass - - # Create validation metadata for content validator - validationMetadata = { - "actionType": 
"outlook.composeAndDraftEmailWithContext", - "emailRecipients": to, - "emailCc": cc, - "emailBcc": bcc, - "emailSubject": subject, - "emailAttachments": attachmentFilenames, - "emailAttachmentReferences": attachmentReferences, - "emailAttachmentCount": len(attachmentFilenames), - "emailStyle": emailStyle, - "hasAttachments": len(attachmentFilenames) > 0 - } - - return ActionResult( - success=True, - documents=[ActionDocument( - documentName=f"ai_generated_email_draft_{self._format_timestamp_for_filename()}.json", - documentData=json.dumps(draftResultData, indent=2), - mimeType="application/json", - validationMetadata=validationMetadata - )] - ) - else: - logger.error(f"Failed to create draft. Status: {response.status_code}, Response: {response.text}") - return ActionResult.isFailure(error=f"Failed to create email draft: {response.status_code} - {response.text}") - - except Exception as e: - logger.error(f"Error creating email via Microsoft Graph API: {str(e)}") - return ActionResult.isFailure(error=f"Failed to create email: {str(e)}") - - except Exception as e: - logger.error(f"Error in composeAndDraftEmailWithContext: {str(e)}") - return ActionResult.isFailure(error=str(e)) - - @action - async def sendDraftEmail(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Send draft email(s) using draft email JSON document(s) from action outlook.composeAndDraftEmailWithContext. - - Input requirements: connectionReference (required); documentList with draft email JSON documents (required). - - Output format: JSON confirmation with sent mail metadata for all emails. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - documentList (list, required): Document reference(s) to draft emails in JSON format (outputs from outlook.composeAndDraftEmailWithContext function). 
- """ - import time - operationId = None - try: - # Init progress logger - workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}" - operationId = f"outlook_send_{workflowId}_{int(time.time())}" - - # Start progress tracking - parentOperationId = parameters.get('parentOperationId') - self.services.chat.progressLogStart( - operationId, - "Send Draft Email", - "Outlook Email Sending", - f"Processing {len(parameters.get('documentList', []))} draft(s)", - parentOperationId=parentOperationId - ) - - connectionReference = parameters.get("connectionReference") - documentList = parameters.get("documentList", []) - - if not connectionReference: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="Connection reference is required") - - if not documentList: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="documentList is required and cannot be empty") - - # Convert single value to list if needed - if isinstance(documentList, str): - documentList = [documentList] - - # Get Microsoft connection - self.services.chat.progressLogUpdate(operationId, 0.2, "Getting Microsoft connection") - connection = self._getMicrosoftConnection(connectionReference) - if not connection: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="No valid Microsoft connection found for the provided connection reference") - - # Check permissions - self.services.chat.progressLogUpdate(operationId, 0.3, "Checking permissions") - permissions_ok = await self._checkPermissions(connection) - if not permissions_ok: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="Connection lacks necessary permissions for Outlook operations") - - # Read draft email JSON documents from documentList - self.services.chat.progressLogUpdate(operationId, 0.4, "Reading draft email documents") - draftEmails = [] - for docRef in documentList: - try: - # Get documents from document reference - from modules.datamodels.datamodelDocref import DocumentReferenceList - chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(DocumentReferenceList.from_string_list([docRef])) - if not chatDocuments: - logger.warning(f"No documents found for reference: {docRef}") - continue - - # Process each document in the reference - for doc in chatDocuments: - try: - # Read file data - fileId = getattr(doc, 'fileId', None) - if not fileId: - logger.warning(f"Document {doc.fileName} has no fileId") - continue - - fileData = self.services.chat.getFileData(fileId) - if not fileData: - logger.warning(f"No file data found for document: {doc.fileName}") - continue - - # Parse JSON content - if isinstance(fileData, bytes): - jsonContent = fileData.decode('utf-8') - else: - jsonContent = str(fileData) - - # Parse JSON - handle both direct JSON and JSON wrapped in documentData - try: - draftEmailData = json.loads(jsonContent) - - # If the JSON contains a 'documentData' field, extract it - if isinstance(draftEmailData, dict) and 'documentData' in draftEmailData: - documentDataStr = draftEmailData['documentData'] - if isinstance(documentDataStr, str): - draftEmailData = json.loads(documentDataStr) - - # Validate draft email structure - if not isinstance(draftEmailData, dict): - logger.warning(f"Document {doc.fileName} does not contain a valid draft email JSON object") - continue - - draftId = 
draftEmailData.get("draftId") - if not draftId: - logger.warning(f"Document {doc.fileName} does not contain 'draftId' field") - continue - - draftEmails.append({ - "draftEmailJson": draftEmailData, - "draftId": draftId, - "sourceDocument": doc.fileName, - "sourceReference": docRef - }) - - except json.JSONDecodeError as e: - logger.error(f"Failed to parse JSON from document {doc.fileName}: {str(e)}") - continue - - except Exception as e: - logger.error(f"Error processing document {doc.fileName}: {str(e)}") - continue - - except Exception as e: - logger.error(f"Error reading documents from reference {docRef}: {str(e)}") - continue - - if not draftEmails: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="No valid draft email JSON documents found in documentList") - - self.services.chat.progressLogUpdate(operationId, 0.6, f"Found {len(draftEmails)} draft email(s) to send") - - # Send all draft emails - graph_url = "https://graph.microsoft.com/v1.0" - headers = { - "Authorization": f"Bearer {connection['accessToken']}", - "Content-Type": "application/json" - } - - sentResults = [] - failedResults = [] - - self.services.chat.progressLogUpdate(operationId, 0.7, "Sending emails") - for idx, draftEmail in enumerate(draftEmails): - draftEmailJson = draftEmail["draftEmailJson"] - draftId = draftEmail["draftId"] - sourceDocument = draftEmail["sourceDocument"] - - try: - send_url = f"{graph_url}/me/messages/{draftId}/send" - sendResponse = requests.post(send_url, headers=headers) - - # Extract email details from draft JSON for confirmation - subject = draftEmailJson.get("subject", "Unknown") - recipients = draftEmailJson.get("recipients", []) - cc = draftEmailJson.get("cc", []) - bcc = draftEmailJson.get("bcc", []) - attachmentsCount = draftEmailJson.get("attachments", 0) - - if sendResponse.status_code in [200, 202, 204]: - sentResults.append({ - "status": "sent", - "message": "Email sent successfully", - "draftId": draftId, - "subject": subject, - "recipients": recipients, - "cc": cc, - "bcc": bcc, - "attachments": attachmentsCount, - "sentTimestamp": self.services.utils.timestampGetUtc(), - "sourceDocument": sourceDocument - }) - logger.info(f"Email sent successfully. Draft ID: {draftId}, Subject: {subject}") - self.services.chat.progressLogUpdate(operationId, 0.7 + (idx + 1) * 0.2 / len(draftEmails), f"Sent {idx + 1}/{len(draftEmails)}: {subject}") - else: - errorResult = { - "status": "error", - "message": "Failed to send draft email", - "draftId": draftId, - "subject": subject, - "recipients": recipients, - "sendError": { - "statusCode": sendResponse.status_code, - "response": sendResponse.text - }, - "sentTimestamp": self.services.utils.timestampGetUtc(), - "sourceDocument": sourceDocument - } - failedResults.append(errorResult) - logger.error(f"Failed to send email. 
Draft ID: {draftId}, Status: {sendResponse.status_code}, Response: {sendResponse.text}") - - except Exception as e: - errorResult = { - "status": "error", - "message": f"Exception while sending draft email: {str(e)}", - "draftId": draftId, - "subject": draftEmailJson.get("subject", "Unknown"), - "recipients": draftEmailJson.get("recipients", []), - "exception": str(e), - "sentTimestamp": self.services.utils.timestampGetUtc(), - "sourceDocument": sourceDocument - } - failedResults.append(errorResult) - logger.error(f"Error sending draft email {draftId}: {str(e)}") - - # Build result summary - totalEmails = len(draftEmails) - successfulEmails = len(sentResults) - failedEmails = len(failedResults) - - resultData = { - "totalEmails": totalEmails, - "successfulEmails": successfulEmails, - "failedEmails": failedEmails, - "sentResults": sentResults, - "failedResults": failedResults, - "timestamp": self.services.utils.timestampGetUtc() - } - - # Determine overall success status - self.services.chat.progressLogUpdate(operationId, 0.9, f"Sent {successfulEmails}/{totalEmails} email(s)") - if successfulEmails == 0: - self.services.chat.progressLogFinish(operationId, False) - validationMetadata = { - "actionType": "outlook.sendDraftEmail", - "connectionReference": connectionReference, - "totalEmails": totalEmails, - "successfulEmails": successfulEmails, - "failedEmails": failedEmails, - "status": "all_failed" - } - return ActionResult.isFailure( - error=f"Failed to send all {totalEmails} email(s)", - documents=[ActionDocument( - documentName=f"sent_mail_confirmation_{self._format_timestamp_for_filename()}.json", - documentData=json.dumps(resultData, indent=2), - mimeType="application/json", - validationMetadata=validationMetadata - )] - ) - elif failedEmails > 0: - # Partial success - logger.warning(f"Sent {successfulEmails} out of {totalEmails} emails. 
{failedEmails} failed.") - validationMetadata = { - "actionType": "outlook.sendDraftEmail", - "connectionReference": connectionReference, - "totalEmails": totalEmails, - "successfulEmails": successfulEmails, - "failedEmails": failedEmails, - "status": "partial_success" - } - self.services.chat.progressLogFinish(operationId, True) - return ActionResult( - success=True, - documents=[ActionDocument( - documentName=f"sent_mail_confirmation_{self._format_timestamp_for_filename()}.json", - documentData=json.dumps(resultData, indent=2), - mimeType="application/json", - validationMetadata=validationMetadata - )] - ) - else: - # All successful - logger.info(f"Successfully sent all {totalEmails} email(s)") - validationMetadata = { - "actionType": "outlook.sendDraftEmail", - "connectionReference": connectionReference, - "totalEmails": totalEmails, - "successfulEmails": successfulEmails, - "failedEmails": failedEmails, - "status": "all_successful" - } - self.services.chat.progressLogFinish(operationId, True) - return ActionResult( - success=True, - documents=[ActionDocument( - documentName=f"sent_mail_confirmation_{self._format_timestamp_for_filename()}.json", - documentData=json.dumps(resultData, indent=2), - mimeType="application/json", - validationMetadata=validationMetadata - )] - ) - - except ImportError: - logger.error("requests module not available") - return ActionResult.isFailure(error="requests module not available") - except Exception as e: - logger.error(f"Error in sendDraftEmail: {str(e)}") - return ActionResult.isFailure(error=str(e)) - - async def checkPermissions(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Verify that the connection has required permissions for Outlook operations. - - Input requirements: connectionReference (required). - - Output format: JSON with permission status and details. - - Parameters: - - connectionReference (str, required): Microsoft connection label to check. - """ - try: - connectionReference = parameters.get("connectionReference") - if not connectionReference: - return ActionResult.isFailure(error="Connection reference is required") - - # Get Microsoft connection - connection = self._getMicrosoftConnection(connectionReference) - if not connection: - return ActionResult.isFailure(error="Failed to get Microsoft connection") - - # Check permissions - permissions_ok = await self._checkPermissions(connection) - - if permissions_ok: - result_data = { - "permissions": "✅ All necessary permissions are available", - "scopes": connection.get("scopes", []), - "connectionId": connection.get("id"), - "status": "ready" - } - - validationMetadata = { - "actionType": "outlook.checkPermissions", - "connectionReference": connectionReference, - "permissionsStatus": "ready", - "hasPermissions": True - } - return ActionResult( - success=True, - documents=[ActionDocument( - documentName=f"outlook_permissions_check_{self._format_timestamp_for_filename()}.json", - documentData=json.dumps(result_data, indent=2), - mimeType="application/json", - validationMetadata=validationMetadata - )] - ) - else: - result_data = { - "permissions": "❌ Missing necessary permissions", - "requiredScopes": ["Mail.ReadWrite", "Mail.Send", "Mail.ReadWrite.Shared", "User.Read"], - "currentScopes": connection.get("scopes", []), - "connectionId": connection.get("id"), - "status": "needs_reauthentication", - "message": "Please re-authenticate your Microsoft connection to get updated permissions." 
- } - - validationMetadata = { - "actionType": "outlook.checkPermissions", - "connectionReference": connectionReference, - "permissionsStatus": "needs_reauthentication", - "hasPermissions": False - } - return ActionResult( - success=False, - documents=[ActionDocument( - documentName=f"outlook_permissions_check_{self._format_timestamp_for_filename()}.json", - documentData=json.dumps(result_data, indent=2), - mimeType="application/json", - validationMetadata=validationMetadata - )], - error="Connection lacks necessary permissions for Outlook operations" - ) - - except Exception as e: - logger.error(f"Error checking permissions: {str(e)}") - return ActionResult.isFailure(error=str(e)) - diff --git a/modules/workflows/methods/methodOutlook/actions/composeAndDraftEmailWithContext.py b/modules/workflows/methods/methodOutlook/actions/composeAndDraftEmailWithContext.py index 2bad3838..59604896 100644 --- a/modules/workflows/methods/methodOutlook/actions/composeAndDraftEmailWithContext.py +++ b/modules/workflows/methods/methodOutlook/actions/composeAndDraftEmailWithContext.py @@ -1,39 +1,16 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Compose And Draft Email With Context action for Outlook operations. -Composes email content using AI from context and optional documents, then creates a draft. -""" - import logging import json import base64 import requests from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def composeAndDraftEmailWithContext(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Compose email content using AI from context and optional documents, then create a draft. - - Input requirements: connectionReference (required); to (required); context (required); optional documentList, cc, bcc, emailStyle, maxLength. - - Output format: JSON confirmation with AI-generated draft metadata. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - to (list, required): Recipient email addresses. - - context (str, required): Detailed context for composing the email. - - documentList (list, optional): Document references for context/attachments. - - cc (list, optional): CC recipients. - - bcc (list, optional): BCC recipients. - - emailStyle (str, optional): formal | casual | business. Default: business. - - maxLength (int, optional): Maximum length for generated content. Default: 1000. - """ try: connectionReference = parameters.get("connectionReference") to = parameters.get("to") diff --git a/modules/workflows/methods/methodOutlook/actions/readEmails.py b/modules/workflows/methods/methodOutlook/actions/readEmails.py index e698cb9f..2d325d9f 100644 --- a/modules/workflows/methods/methodOutlook/actions/readEmails.py +++ b/modules/workflows/methods/methodOutlook/actions/readEmails.py @@ -1,36 +1,16 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Read Emails action for Outlook operations. -Reads emails and metadata from a mailbox folder. 
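As seen at the top of this hunk, readEmails drops `$orderby` whenever the `$filter` expression uses `contains()`, since Microsoft Graph rejects that combination on `/me/messages`. A small sketch of that parameter-building rule; `build_read_params` and the default sort field are assumptions for illustration:

```python
from typing import Dict, Optional

def build_read_params(limit: int, filter_expr: Optional[str]) -> Dict[str, object]:
    """Build Graph query params; contains() filters cannot be combined with $orderby."""
    params: Dict[str, object] = {"$top": limit, "$orderby": "receivedDateTime desc"}
    if filter_expr:
        params["$filter"] = filter_expr
        if "contains(" in filter_expr.lower():
            # Graph rejects $orderby together with a contains() filter, so drop the sort
            params.pop("$orderby", None)
    return params
```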
-""" - import logging import time import json import requests from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def readEmails(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Read emails and metadata from a mailbox folder. - - Input requirements: connectionReference (required); optional folder, limit, filter, outputMimeType. - - Output format: JSON with emails and metadata. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - folder (str, optional): Folder to read from. Default: Inbox. - - limit (int, optional): Maximum items to return. Must be > 0. Default: 1000. - - filter (str, optional): Sender, query operators, or subject text. - - outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json". - """ operationId = None try: # Init progress logger diff --git a/modules/workflows/methods/methodOutlook/actions/searchEmails.py b/modules/workflows/methods/methodOutlook/actions/searchEmails.py index 72830caf..f8831d59 100644 --- a/modules/workflows/methods/methodOutlook/actions/searchEmails.py +++ b/modules/workflows/methods/methodOutlook/actions/searchEmails.py @@ -1,35 +1,15 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Search Emails action for Outlook operations. -Searches emails by query and returns matching items with metadata. -""" - import logging import json import requests from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def searchEmails(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Search emails by query and return matching items with metadata. - - Input requirements: connectionReference (required); query (required); optional folder, limit, outputMimeType. - - Output format: JSON with search results and metadata. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - query (str, required): Search expression. - - folder (str, optional): Folder scope or All. Default: All. - - limit (int, optional): Maximum items to return. Must be > 0. Default: 1000. - - outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json". - """ try: connectionReference = parameters.get("connectionReference") query = parameters.get("query") diff --git a/modules/workflows/methods/methodOutlook/actions/sendDraftEmail.py b/modules/workflows/methods/methodOutlook/actions/sendDraftEmail.py index ffae4c8d..9b7fb011 100644 --- a/modules/workflows/methods/methodOutlook/actions/sendDraftEmail.py +++ b/modules/workflows/methods/methodOutlook/actions/sendDraftEmail.py @@ -1,33 +1,16 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Send Draft Email action for Outlook operations. -Sends draft email(s) using draft email JSON document(s) from action outlook.composeAndDraftEmailWithContext. 
-""" - import logging import time import json import requests from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def sendDraftEmail(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Send draft email(s) using draft email JSON document(s) from action outlook.composeAndDraftEmailWithContext. - - Input requirements: connectionReference (required); documentList with draft email JSON documents (required). - - Output format: JSON confirmation with sent mail metadata for all emails. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - documentList (list, required): Document reference(s) to draft emails in JSON format (outputs from outlook.composeAndDraftEmailWithContext function). - """ operationId = None try: # Init progress logger diff --git a/modules/workflows/methods/methodSharepoint.py.old b/modules/workflows/methods/methodSharepoint.py.old deleted file mode 100644 index d12b53eb..00000000 --- a/modules/workflows/methods/methodSharepoint.py.old +++ /dev/null @@ -1,2840 +0,0 @@ -# Copyright (c) 2025 Patrick Motsch -# All rights reserved. -""" -SharePoint operations method module. -Handles SharePoint document operations using the SharePoint service. -""" - -import logging -import re -import json -from typing import Dict, Any, List, Optional -from datetime import datetime, UTC, timedelta, timezone -import urllib -import aiohttp -import asyncio - -from modules.workflows.methods.methodBase import MethodBase, action -from modules.datamodels.datamodelChat import ActionResult, ActionDocument - -logger = logging.getLogger(__name__) - -class MethodSharepoint(MethodBase): - """SharePoint operations methods.""" - - def __init__(self, services): - super().__init__(services) - self.name = "sharepoint" - self.description = "SharePoint operations methods" - - def _format_timestamp_for_filename(self) -> str: - """Format current timestamp as YYYYMMDD-hhmmss for filenames.""" - return datetime.now(UTC).strftime("%Y%m%d-%H%M%S") - - def _getMicrosoftConnection(self, connectionReference: str) -> Optional[Dict[str, Any]]: - """Get Microsoft connection from connection reference and configure SharePoint service""" - try: - userConnection = self.services.chat.getUserConnectionFromConnectionReference(connectionReference) - if not userConnection: - logger.warning(f"No user connection found for reference: {connectionReference}") - return None - - if userConnection.authority.value != "msft": - logger.warning(f"Connection {userConnection.id} is not Microsoft (authority: {userConnection.authority.value})") - return None - - # Check if connection is active or pending (pending means OAuth in progress) - if userConnection.status.value not in ["active", "pending"]: - logger.warning(f"Connection {userConnection.id} status is not active/pending: {userConnection.status.value}") - return None - - # Configure SharePoint service with the UserConnection - if not self.services.sharepoint.setAccessTokenFromConnection(userConnection): - logger.warning(f"Failed to configure SharePoint service with connection {userConnection.id}") - return None - - logger.info(f"Successfully configured SharePoint service with Microsoft connection: {userConnection.id}, status: {userConnection.status.value}, externalId: {userConnection.externalId}") - - return { - "id": userConnection.id, - "userConnection": userConnection, - 
"scopes": ["Sites.ReadWrite.All", "Files.ReadWrite.All", "User.Read"] # SharePoint scopes - } - except Exception as e: - logger.error(f"Error getting Microsoft connection: {str(e)}") - return None - - async def _discoverSharePointSites(self, limit: Optional[int] = None) -> List[Dict[str, Any]]: - """ - Discover SharePoint sites accessible to the user via Microsoft Graph API - - Parameters: - limit (Optional[int]): Limit number of sites to return (for optimization when only hostname is needed) - - Returns: - List[Dict[str, Any]]: List of SharePoint site information - """ - try: - # Query Microsoft Graph to get sites the user has access to - endpoint = "sites?search=*" - if limit: - endpoint += f"&$top={limit}" - - result = await self._makeGraphApiCall(endpoint) - - if "error" in result: - logger.error(f"Error discovering SharePoint sites: {result['error']}") - return [] - - sites = result.get("value", []) - if limit: - sites = sites[:limit] - - logger.info(f"Discovered {len(sites)} SharePoint sites" + (f" (limited to {limit})" if limit else "")) - - # Process and return site information - processedSites = [] - for site in sites: - siteInfo = { - "id": site.get("id"), - "displayName": site.get("displayName"), - "name": site.get("name"), - "webUrl": site.get("webUrl"), - "description": site.get("description"), - "createdDateTime": site.get("createdDateTime"), - "lastModifiedDateTime": site.get("lastModifiedDateTime") - } - processedSites.append(siteInfo) - logger.debug(f"Site: {siteInfo['displayName']} - {siteInfo['webUrl']}") - - return processedSites - - except Exception as e: - logger.error(f"Error discovering SharePoint sites: {str(e)}") - return [] - - def _extractHostnameFromWebUrl(self, webUrl: str) -> Optional[str]: - """Extract hostname from SharePoint webUrl (e.g., https://pcuster.sharepoint.com)""" - try: - if not webUrl: - return None - parsed = urllib.parse.urlparse(webUrl) - return parsed.hostname - except Exception as e: - logger.error(f"Error extracting hostname from webUrl '{webUrl}': {str(e)}") - return None - - def _extractSiteFromStandardPath(self, pathQuery: str) -> Optional[Dict[str, str]]: - """ - Extract site name from Microsoft-standard server-relative path. - Delegates to SharePoint service. - """ - return self.services.sharepoint.extractSiteFromStandardPath(pathQuery) - - async def _getSiteByStandardPath(self, sitePath: str) -> Optional[Dict[str, Any]]: - """ - Get SharePoint site directly by Microsoft-standard path. - Delegates to SharePoint service. - """ - return await self.services.sharepoint.getSiteByStandardPath(sitePath) - - def _filterSitesByHint(self, sites: List[Dict[str, Any]], siteHint: str) -> List[Dict[str, Any]]: - """ - Filter discovered sites by a human-entered site hint. - Delegates to SharePoint service. - """ - return self.services.sharepoint.filterSitesByHint(sites, siteHint) - - def _parseSearchQuery(self, searchQuery: str) -> tuple[str, str, str, dict]: - """ - Parse searchQuery to extract path, search terms, search type, and search options. - - CRITICAL: NEVER convert words to paths! Words stay as search terms. 
- - "root document lesson" → fileQuery="root document lesson" (NOT "/root/document/lesson") - - "root, gose" → fileQuery="root, gose" (NOT "/root/gose") - - "druckersteuerung eskalation logobject" → fileQuery="druckersteuerung eskalation logobject" - - Parameters: - searchQuery (str): Enhanced search query with options: - - "budget" -> pathQuery="*", fileQuery="budget", searchType="all", options={} - - "root document lesson" -> pathQuery="*", fileQuery="root document lesson", searchType="all", options={} - - "root, gose" -> pathQuery="*", fileQuery="root, gose", searchType="all", options={} - - "/Documents:budget" -> pathQuery="/Documents", fileQuery="budget", searchType="all", options={} - - "files:budget" -> pathQuery="*", fileQuery="budget", searchType="files", options={} - - "folders:DELTA" -> pathQuery="*", fileQuery="DELTA", searchType="folders", options={} - - "exact:\"Operations 2025\"" -> exact phrase matching - - "regex:^Operations.*2025$" -> regex pattern matching - - "case:DELTA" -> case-sensitive search - - "and:DELTA AND 2025 Mars AND Group" -> all AND terms must be present - - Returns: - tuple[str, str, str, dict]: (pathQuery, fileQuery, searchType, searchOptions) - """ - try: - if not searchQuery or not searchQuery.strip() or searchQuery.strip() == "*": - return "*", "*", "all", {} - - searchQuery = searchQuery.strip() - searchOptions = {} - - # CRITICAL: Do NOT convert space-separated or comma-separated words to paths! - # "root document lesson" should stay as "root document lesson", NOT "/root/document/lesson" - # "root, gose" should stay as "root, gose", NOT "/root/gose" - - # Check for search type specification (files:, folders:, all:) FIRST - searchType = "all" # Default - if searchQuery.startswith(("files:", "folders:", "all:")): - typeParts = searchQuery.split(':', 1) - searchType = typeParts[0].strip() - searchQuery = typeParts[1].strip() - - # Extract optional site hint tokens: support "site=Name" or leading "site:Name" - def _extractSiteHint(q: str) -> tuple[str, Optional[str]]: - try: - qStrip = q.strip() - # Leading form: site:KM LayerFinance ... - if qStrip.lower().startswith("site:"): - after = qStrip[5:].lstrip() - # site name until next space or end - if ' ' in after: - siteName, rest = after.split(' ', 1) - else: - siteName, rest = after, '' - return rest.strip(), siteName.strip() - # Inline key=value form anywhere - m = re.search(r"\bsite=([^;\s]+)", qStrip, flags=re.IGNORECASE) - if m: - siteName = m.group(1).strip() - # remove the token from query - qNew = re.sub(r"\bsite=[^;\s]+;?", "", qStrip, flags=re.IGNORECASE).strip() - return qNew, siteName - except Exception: - pass - return q, None - - searchQuery, extractedSite = _extractSiteHint(searchQuery) - if extractedSite: - searchOptions["site_hint"] = extractedSite - logger.info(f"Extracted site hint: '{extractedSite}'") - - # Extract name="..." 
if present (for quoted multi-word names) - nameMatch = re.search(r"name=\"([^\"]+)\"", searchQuery) - if nameMatch: - searchQuery = nameMatch.group(1) - logger.info(f"Extracted name from quotes: '{searchQuery}'") - - # Check for search mode specification (exact:, regex:, case:, and:) - if searchQuery.startswith(("exact:", "regex:", "case:", "and:")): - modeParts = searchQuery.split(':', 1) - mode = modeParts[0].strip() - searchQuery = modeParts[1].strip() - - if mode == "exact": - searchOptions["exact_match"] = True - # Remove quotes if present - if searchQuery.startswith('"') and searchQuery.endswith('"'): - searchQuery = searchQuery[1:-1] - elif mode == "regex": - searchOptions["regex_match"] = True - elif mode == "case": - searchOptions["case_sensitive"] = True - elif mode == "and": - searchOptions["and_terms"] = True - - # Check if it contains path:search format - # Microsoft-standard paths: /sites/SiteName/Path:files:.pdf - if ':' in searchQuery: - # For Microsoft-standard paths (/sites/...), find the colon that separates path from search - if searchQuery.startswith('/sites/'): - # Find the colon that separates path from search (after the full path) - # Look for pattern: /sites/SiteName/Path/...:files:.pdf - # We need to find the colon that's followed by search type or file extension - colonPositions = [] - for i, char in enumerate(searchQuery): - if char == ':': - colonPositions.append(i) - - # If we have colons, find the one that's followed by search type or file extension - splitPos = None - if colonPositions: - for pos in colonPositions: - afterColon = searchQuery[pos+1:pos+10].strip().lower() - # Check if this colon is followed by search type or looks like a file extension - if afterColon.startswith(('files:', 'folders:', 'all:', '.')) or afterColon == '': - splitPos = pos - break - - # If no clear split found, use the last colon - if splitPos is None and colonPositions: - splitPos = colonPositions[-1] - - if splitPos: - pathPart = searchQuery[:splitPos].strip() - searchPart = searchQuery[splitPos+1:].strip() - else: - # Fallback: split on first colon - parts = searchQuery.split(':', 1) - pathPart = parts[0].strip() - searchPart = parts[1].strip() - else: - # Regular path:search format - split on first colon - parts = searchQuery.split(':', 1) - pathPart = parts[0].strip() - searchPart = parts[1].strip() - - # Check if searchPart starts with search type (files:, folders:, all:) - if searchPart.startswith(("files:", "folders:", "all:")): - typeParts = searchPart.split(':', 1) - searchType = typeParts[0].strip() # Update searchType - searchPart = typeParts[1].strip() if len(typeParts) > 1 else "" - - # Handle path part - if not pathPart or pathPart == "*": - pathQuery = "*" - elif pathPart.startswith('/'): - pathQuery = pathPart - else: - pathQuery = f"/Documents/{pathPart}" - - # Handle search part - if not searchPart or searchPart == "*": - fileQuery = "*" - else: - fileQuery = searchPart - - return pathQuery, fileQuery, searchType, searchOptions - - # No colon - check if it looks like a path - elif searchQuery.startswith('/'): - # It's a path only - return searchQuery, "*", searchType, searchOptions - - else: - # It's a search term only - keep words as-is, do NOT convert to paths - # "root document lesson" stays as "root document lesson" - # "root, gose" stays as "root, gose" - return "*", searchQuery, searchType, searchOptions - - except Exception as e: - logger.error(f"Error parsing searchQuery '{searchQuery}': {str(e)}") - raise ValueError(f"Failed to parse searchQuery 
'{searchQuery}': {str(e)}") - - def _resolvePathQuery(self, pathQuery: str) -> List[str]: - """ - Resolve pathQuery into a list of search paths for SharePoint operations. - - Parameters: - pathQuery (str): Query string that can contain: - - Direct paths (e.g., "/Documents/Project1") - - Wildcards (e.g., "/Documents/*") - - Multiple paths separated by semicolons (e.g., "/Docs; /Files") - - Single word relative paths (e.g., "Project1" -> resolved to default folder) - - Empty string or "*" for global search - - Space-separated words are treated as search terms, NOT folder paths - - Returns: - List[str]: List of resolved paths - """ - try: - if not pathQuery or not pathQuery.strip() or pathQuery.strip() == "*": - return ["*"] # Global search across all sites - - # Split by semicolon to handle multiple paths - rawPaths = [path.strip() for path in pathQuery.split(';') if path.strip()] - resolvedPaths = [] - - for rawPath in rawPaths: - # Handle wildcards - return as-is - if '*' in rawPath: - resolvedPaths.append(rawPath) - # Handle absolute paths - elif rawPath.startswith('/'): - resolvedPaths.append(rawPath) - # Handle single word relative paths - prepend default folder - # BUT NOT space-separated words (those are search terms, not paths) - elif ' ' not in rawPath: - resolvedPaths.append(f"/Documents/{rawPath}") - else: - # Check if this looks like a path (has path separators) or search terms - if '\\' in rawPath or '/' in rawPath: - # This looks like a path with spaces in folder names - treat as valid path - resolvedPaths.append(rawPath) - logger.info(f"Path with spaces '{rawPath}' treated as valid folder path") - else: - # Space-separated words without path separators are search terms - # Return as "*" to search globally - logger.info(f"Space-separated words '{rawPath}' treated as search terms, not folder path") - resolvedPaths.append("*") - - # Remove duplicates while preserving order - seen = set() - uniquePaths = [] - for path in resolvedPaths: - if path not in seen: - seen.add(path) - uniquePaths.append(path) - - logger.info(f"Resolved pathQuery '{pathQuery}' to {len(uniquePaths)} paths: {uniquePaths}") - return uniquePaths - - except Exception as e: - logger.error(f"Error resolving pathQuery '{pathQuery}': {str(e)}") - raise ValueError(f"Failed to resolve pathQuery '{pathQuery}': {str(e)}") - - def _parseSiteUrl(self, siteUrl: str) -> Dict[str, str]: - """Parse SharePoint site URL to extract hostname and site path""" - try: - parsed = urllib.parse.urlparse(siteUrl) - hostname = parsed.hostname - path = parsed.path.strip('/') - - return { - "hostname": hostname, - "sitePath": path - } - except Exception as e: - logger.error(f"Error parsing site URL {siteUrl}: {str(e)}") - return {"hostname": "", "sitePath": ""} - - def _cleanSearchQuery(self, query: str) -> str: - """ - Clean search query to make it compatible with Graph API KQL syntax. - Removes path-like syntax and invalid KQL constructs. 
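# A minimal sketch (editorial, outside the diff): the _resolvePathQuery rules above
# as one standalone function. Name is illustrative and the dedup step is omitted.
def resolve_path_query(path_query: str) -> list[str]:
    if not path_query or path_query.strip() in ("", "*"):
        return ["*"]  # global search
    resolved = []
    for raw in (p.strip() for p in path_query.split(";") if p.strip()):
        if "*" in raw or raw.startswith("/"):
            resolved.append(raw)
        elif " " not in raw:
            resolved.append(f"/Documents/{raw}")  # single word -> default library
        elif "/" in raw or "\\" in raw:
            resolved.append(raw)  # spaces inside a real folder path
        else:
            resolved.append("*")  # space-separated words are search terms, not a path
    return resolved

assert resolve_path_query("/Docs; Project1") == ["/Docs", "/Documents/Project1"]
assert resolve_path_query("root document lesson") == ["*"]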
- - Parameters: - query (str): Raw search query that may contain paths and invalid syntax - - Returns: - str: Cleaned query suitable for Graph API search endpoint - """ - if not query or not query.strip(): - return "" - - query = query.strip() - - # Handle patterns like: "Company Share/Freigegebene Dokumente/.../expenses:files:.pdf" - # Extract the search term and file extension - - # First, extract file extension if present (format: :files:.pdf or just .pdf at the end) - fileExtension = "" - if ':files:' in query.lower() or ':folders:' in query.lower(): - # Extract extension after the type filter - extMatch = re.search(r':(?:files|folders):(\.\w+)', query, re.IGNORECASE) - if extMatch: - fileExtension = extMatch.group(1) - # Remove the type filter part - query = re.sub(r':(?:files|folders):\.?\w*', '', query, flags=re.IGNORECASE) - elif query.endswith(('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.txt', '.csv', '.ppt', '.pptx')): - # Extract extension from end - extMatch = re.search(r'(\.\w+)$', query) - if extMatch: - fileExtension = extMatch.group(1) - query = query[:-len(fileExtension)] - - # Extract search term: get the last segment after the last slash (filename part) - queryNormalized = query.replace('\\', '/') - if '/' in queryNormalized: - # Extract the last segment (usually the filename/search term) - lastSegment = queryNormalized.split('/')[-1] - # Remove any remaining colons or type filters - if ':' in lastSegment: - lastSegment = lastSegment.split(':')[0] - searchTerm = lastSegment.strip() - else: - # No path separators, use the query as-is but remove type filters - if ':' in query: - searchTerm = query.split(':')[0].strip() - else: - searchTerm = query.strip() - - # Remove any remaining type filters or invalid syntax - searchTerm = re.sub(r':(?:files|folders|all):?', '', searchTerm, flags=re.IGNORECASE) - searchTerm = searchTerm.strip() - - # If we have a file extension, include it in the search term - # Note: Graph API search endpoint may not support filetype: syntax - # So we include the extension as part of the search term or filter results after - if fileExtension: - extWithoutDot = fileExtension.lstrip('.') - # Try simple approach: add extension as search term - # If this doesn't work, we'll filter results after search - if searchTerm: - # Include extension in search - Graph API will search in filename - searchTerm = f"{searchTerm} {extWithoutDot}" - else: - searchTerm = extWithoutDot - - # Final cleanup: remove any remaining invalid characters for KQL - # Keep alphanumeric, spaces, hyphens, underscores, dots, and common search operators - searchTerm = re.sub(r'[^\w\s\-\.\*]', ' ', searchTerm) - searchTerm = ' '.join(searchTerm.split()) # Normalize whitespace - - return searchTerm if searchTerm else "*" - - async def _makeGraphApiCall(self, endpoint: str, method: str = "GET", data: bytes = None) -> Dict[str, Any]: - """Make a Microsoft Graph API call with timeout and detailed logging""" - try: - if not hasattr(self.services, 'sharepoint') or not self.services.sharepoint._target.accessToken: - return {"error": "SharePoint service not configured with access token"} - - headers = { - "Authorization": f"Bearer {self.services.sharepoint._target.accessToken}", - "Content-Type": "application/json" if data and method != "PUT" else "application/octet-stream" if data else "application/json" - } - - url = f"https://graph.microsoft.com/v1.0/{endpoint}" - logger.info(f"Making Graph API call: {method} {url}") - - # Set timeout to 30 seconds - timeout = aiohttp.ClientTimeout(total=30) - 
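# A minimal sketch (editorial, outside the diff): the GET/PUT/POST branches that
# follow all repeat the same response handling; aiohttp's session.request() takes
# the HTTP verb as an argument and collapses them into one path. Headers and the
# 30-second timeout are assumed to be built exactly as in the code above.
import aiohttp

async def graph_call(url: str, headers: dict, method: str = "GET", data: bytes | None = None) -> dict:
    timeout = aiohttp.ClientTimeout(total=30)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.request(method, url, headers=headers, data=data) as response:
            if response.status in (200, 201):
                return await response.json()
            return {"error": f"API call failed: {response.status} - {await response.text()}"}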
- async with aiohttp.ClientSession(timeout=timeout) as session: - if method == "GET": - logger.debug(f"Starting GET request to {url}") - async with session.get(url, headers=headers) as response: - logger.info(f"Graph API response: {response.status}") - if response.status == 200: - result = await response.json() - logger.debug(f"Graph API success: {len(str(result))} characters response") - return result - else: - errorText = await response.text() - logger.error(f"Graph API call failed: {response.status} - {errorText}") - return {"error": f"API call failed: {response.status} - {errorText}"} - - elif method == "PUT": - logger.debug(f"Starting PUT request to {url}") - async with session.put(url, headers=headers, data=data) as response: - logger.info(f"Graph API response: {response.status}") - if response.status in [200, 201]: - result = await response.json() - logger.debug(f"Graph API success: {len(str(result))} characters response") - return result - else: - errorText = await response.text() - logger.error(f"Graph API call failed: {response.status} - {errorText}") - return {"error": f"API call failed: {response.status} - {errorText}"} - - elif method == "POST": - logger.debug(f"Starting POST request to {url}") - async with session.post(url, headers=headers, data=data) as response: - logger.info(f"Graph API response: {response.status}") - if response.status in [200, 201]: - result = await response.json() - logger.debug(f"Graph API success: {len(str(result))} characters response") - return result - else: - errorText = await response.text() - logger.error(f"Graph API call failed: {response.status} - {errorText}") - return {"error": f"API call failed: {response.status} - {errorText}"} - - except asyncio.TimeoutError: - logger.error(f"Graph API call timed out after 30 seconds: {endpoint}") - return {"error": f"API call timed out after 30 seconds: {endpoint}"} - except Exception as e: - logger.error(f"Error making Graph API call: {str(e)}") - return {"error": f"Error making Graph API call: {str(e)}"} - - async def _getSiteId(self, hostname: str, sitePath: str) -> str: - """Get SharePoint site ID from hostname and site path""" - try: - endpoint = f"sites/{hostname}:/{sitePath}" - result = await self._makeGraphApiCall(endpoint) - - if "error" in result: - logger.error(f"Error getting site ID: {result['error']}") - return "" - - return result.get("id", "") - except Exception as e: - logger.error(f"Error getting site ID: {str(e)}") - return "" - - async def _parseDocumentListForFoundDocuments(self, documentList: Any) -> tuple[Optional[List[Dict[str, Any]]], Optional[List[Dict[str, Any]]], Optional[str]]: - """ - Parse documentList to extract foundDocuments and site information. 
- - Parameters: - documentList: Document list (can be list, DocumentReferenceList, or string) - - Returns: - tuple: (foundDocuments, sites, errorMessage) - - foundDocuments: List of found documents from findDocumentPath result - - sites: List of site dictionaries with id, displayName, webUrl - - errorMessage: Error message if parsing failed, None otherwise - """ - try: - if isinstance(documentList, str): - documentList = [documentList] - - # Resolve documentList to get actual documents - from modules.datamodels.datamodelDocref import DocumentReferenceList - if isinstance(documentList, DocumentReferenceList): - docRefList = documentList - elif isinstance(documentList, list): - docRefList = DocumentReferenceList.from_string_list(documentList) - else: - docRefList = DocumentReferenceList(references=[]) - - chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(docRefList) - if not chatDocuments: - return None, None, "No documents found for the provided document list" - - firstDocument = chatDocuments[0] - fileData = self.services.chat.getFileData(firstDocument.fileId) - if not fileData: - return None, None, None # No fileData, but not an error (might be regular file) - - try: - resultData = json.loads(fileData) - foundDocuments = resultData.get("foundDocuments", []) - - # If no foundDocuments, check if it's a listDocuments result (has listResults) - if not foundDocuments and "listResults" in resultData: - logger.info(f"documentList contains listResults from listDocuments, converting to foundDocuments format") - listResults = resultData.get("listResults", []) - foundDocuments = [] - siteIdFromList = None - siteNameFromList = None - - for listResult in listResults: - siteResults = listResult.get("siteResults", []) - for siteResult in siteResults: - items = siteResult.get("items", []) - # Extract site info from first item if available - if items and not siteIdFromList: - siteNameFromList = items[0].get("siteName") - - for item in items: - # Convert listDocuments item format to foundDocuments format - if item.get("type") == "file": - foundDoc = { - "id": item.get("id"), - "name": item.get("name"), - "type": "file", - "siteName": item.get("siteName"), - "siteId": None, # Will be determined from site discovery - "webUrl": item.get("webUrl"), - "fullPath": item.get("webUrl", ""), - "parentPath": item.get("parentPath", "") - } - foundDocuments.append(foundDoc) - - # Discover sites to get siteId if we have siteName - if foundDocuments and siteNameFromList and not siteIdFromList: - logger.info(f"Discovering sites to find siteId for '{siteNameFromList}'") - allSites = await self._discoverSharePointSites() - matchingSites = self._filterSitesByHint(allSites, siteNameFromList) - if matchingSites: - siteIdFromList = matchingSites[0].get("id") - # Update all foundDocuments with siteId - for doc in foundDocuments: - doc["siteId"] = siteIdFromList - logger.info(f"Found siteId '{siteIdFromList}' for site '{siteNameFromList}'") - - logger.info(f"Converted {len(foundDocuments)} files from listResults format") - - if not foundDocuments: - return None, None, None # No foundDocuments, but not an error - - # Extract site information from foundDocuments - firstDoc = foundDocuments[0] - siteName = firstDoc.get("siteName") - siteId = firstDoc.get("siteId") - - # If siteId is missing (from listDocuments conversion), discover sites to find it - if siteName and not siteId: - logger.info(f"Site ID missing, discovering sites to find siteId for '{siteName}'") - allSites = await self._discoverSharePointSites() - 
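# A minimal sketch (editorial, outside the diff): the listResults -> foundDocuments
# mapping above as a pure function; the keys mirror the code, the function name is
# illustrative.
from typing import Any, Dict

def to_found_document(item: Dict[str, Any]) -> Dict[str, Any]:
    """Map a listDocuments file item onto the findDocumentPath result shape."""
    return {
        "id": item.get("id"),
        "name": item.get("name"),
        "type": "file",
        "siteName": item.get("siteName"),
        "siteId": None,  # resolved later via site discovery
        "webUrl": item.get("webUrl"),
        "fullPath": item.get("webUrl", ""),
        "parentPath": item.get("parentPath", ""),
    }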
matchingSites = self._filterSitesByHint(allSites, siteName) - if matchingSites: - siteId = matchingSites[0].get("id") - logger.info(f"Found siteId '{siteId}' for site '{siteName}'") - - sites = None - if siteName and siteId: - sites = [{ - "id": siteId, - "displayName": siteName, - "webUrl": firstDoc.get("webUrl", "") - }] - logger.info(f"Using specific site from documentList: {siteName} (ID: {siteId})") - elif siteName: - # Try to get site by name - allSites = await self._discoverSharePointSites() - matchingSites = self._filterSitesByHint(allSites, siteName) - if matchingSites: - sites = [{ - "id": matchingSites[0].get("id"), - "displayName": siteName, - "webUrl": matchingSites[0].get("webUrl", "") - }] - logger.info(f"Found site by name: {siteName} (ID: {sites[0]['id']})") - else: - return None, None, f"Site '{siteName}' not found. Cannot determine target site." - else: - return None, None, "Site information missing from documentList. Cannot determine target site." - - return foundDocuments, sites, None - - except json.JSONDecodeError as e: - return None, None, f"Invalid JSON in documentList: {str(e)}" - except Exception as e: - return None, None, f"Error processing documentList: {str(e)}" - - except Exception as e: - logger.error(f"Error parsing documentList: {str(e)}") - return None, None, f"Error parsing documentList: {str(e)}" - - async def _resolveSitesFromPathQuery(self, pathQuery: str) -> tuple[List[Dict[str, Any]], Optional[str]]: - """ - Resolve sites from pathQuery using SharePoint service helper methods. - - Parameters: - pathQuery (str): Path query string - - Returns: - tuple: (sites, errorMessage) - - sites: List of site dictionaries - - errorMessage: Error message if resolution failed, None otherwise - """ - try: - # Validate pathQuery format - isValid, errorMsg = self.services.sharepoint.validatePathQuery(pathQuery) - if not isValid: - return [], errorMsg - - # Resolve sites using service helper - sites = await self.services.sharepoint.resolveSitesFromPathQuery(pathQuery) - if not sites: - return [], "No SharePoint sites found or accessible" - - return sites, None - except Exception as e: - logger.error(f"Error resolving sites from pathQuery '{pathQuery}': {str(e)}") - return [], f"Error resolving sites from pathQuery: {str(e)}" - - - @action - async def findDocumentPath(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Find documents and folders by name/path across sites. - - Input requirements: connectionReference (required); searchQuery (required); optional site, maxResults. - - Output format: JSON with found items and paths. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - site (str, optional): Site hint. - - searchQuery (str, required): Search terms or path. - - maxResults (int, optional): Maximum items to return. Default: 1000. 
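# A minimal sketch (editorial, outside the diff): every action below repeats the
# pattern "progressLogStart ... progressLogFinish(operationId, False) on each early
# return". An async context manager can centralize that bookkeeping; `chat` stands
# in for self.services.chat, an interface assumed from the calls in this file.
from contextlib import asynccontextmanager

@asynccontextmanager
async def progress_scope(chat, operation_id, title, category, detail, parent=None):
    chat.progressLogStart(operation_id, title, category, detail, parentOperationId=parent)
    ok = False
    try:
        yield
        ok = True
    finally:
        chat.progressLogFinish(operation_id, ok)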
- """ - import time - operationId = None - try: - # Init progress logger - workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}" - operationId = f"sharepoint_find_{workflowId}_{int(time.time())}" - - # Start progress tracking - parentOperationId = parameters.get('parentOperationId') - self.services.chat.progressLogStart( - operationId, - "Find Document Path", - "SharePoint Search", - f"Query: {parameters.get('searchQuery', '*')}", - parentOperationId=parentOperationId - ) - - connectionReference = parameters.get("connectionReference") - site = parameters.get("site") - searchQuery = parameters.get("searchQuery", "*") - maxResults = parameters.get("maxResults", 1000) - - if not connectionReference: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="Connection reference is required") - - # Parse searchQuery to extract path, search terms, search type, and options - pathQuery, fileQuery, searchType, searchOptions = self._parseSearchQuery(searchQuery) - logger.debug(f"Parsed searchQuery '{searchQuery}' -> pathQuery='{pathQuery}', fileQuery='{fileQuery}', searchType='{searchType}'") - - self.services.chat.progressLogUpdate(operationId, 0.2, "Getting Microsoft connection") - connection = self._getMicrosoftConnection(connectionReference) - if not connection: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="No valid Microsoft connection found for the provided connection reference") - - # Extract site name from pathQuery if it contains Microsoft-standard path (/sites/SiteName/...) - siteFromPath = None - directSite = None - if pathQuery and pathQuery.startswith('/sites/'): - parsedPath = self._extractSiteFromStandardPath(pathQuery) - if parsedPath: - siteFromPath = parsedPath.get("siteName") - logger.info(f"Extracted site from Microsoft-standard pathQuery '{pathQuery}': '{siteFromPath}'") - - # Try to get site directly by path (optimization - no need to load all 60 sites) - directSite = await self._getSiteByStandardPath(siteFromPath) - if directSite: - logger.info(f"Got site directly by standard path - no need to discover all sites") - sites = [directSite] - else: - logger.warning(f"Could not get site directly, falling back to site discovery") - directSite = None - else: - logger.warning(f"Failed to parse site from standard pathQuery '{pathQuery}'") - - # If we didn't get the site directly, use discovery and filtering - if not directSite: - # Determine which site hint to use (priority: site parameter > site from pathQuery > site_hint from searchOptions) - siteHintToUse = site or siteFromPath or searchOptions.get("site_hint") - - # Discover SharePoint sites - use targeted approach when site hint is available - self.services.chat.progressLogUpdate(operationId, 0.3, "Discovering SharePoint sites") - if siteHintToUse: - # When site hint is available, discover all sites first, then filter - allSites = await self._discoverSharePointSites() - if not allSites: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="No SharePoint sites found or accessible") - - sites = self._filterSitesByHint(allSites, siteHintToUse) - logger.info(f"Filtered sites by site hint '{siteHintToUse}' -> {len(sites)} sites") - if not sites: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error=f"No SharePoint sites found matching 
'{siteHintToUse}'") - else: - # No site hint - discover all sites - sites = await self._discoverSharePointSites() - if not sites: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="No SharePoint sites found or accessible") - - # Resolve path query into search paths - searchPaths = self._resolvePathQuery(pathQuery) - - self.services.chat.progressLogUpdate(operationId, 0.5, f"Searching across {len(sites)} site(s)") - - try: - # Search across all discovered sites - foundDocuments = [] - allSitesSearched = [] - - # Handle different search approaches based on search type - if searchType == "folders" and fileQuery and fileQuery.strip() != "" and fileQuery.strip() != "*": - # Use unified search for folders - this is global and searches all sites - try: - - # Use Microsoft Graph Search API syntax (simple term search only) - terms = [t for t in fileQuery.split() if t.strip()] - - if len(terms) > 1: - # Multiple terms: search for ALL terms (AND) - more specific results - queryString = " AND ".join(terms) - else: - # Single term: search for the term - queryString = terms[0] if terms else fileQuery - logger.info(f"Using unified search for folders: {queryString}") - - payload = { - "requests": [ - { - "entityTypes": ["driveItem"], - "query": {"queryString": queryString}, - "from": 0, - "size": 50 - } - ] - } - logger.info(f"Using unified search API for folders with queryString: {queryString}") - - # Use global search endpoint (site-specific search not available) - unifiedResult = await self._makeGraphApiCall( - "search/query", - method="POST", - data=json.dumps(payload).encode("utf-8") - ) - - if "error" in unifiedResult: - logger.warning(f"Unified search failed: {unifiedResult['error']}") - items = [] - else: - # Flatten hits -> driveItem resources - items = [] - for container in (unifiedResult.get("value", []) or []): - for hitsContainer in (container.get("hitsContainers", []) or []): - for hit in (hitsContainer.get("hits", []) or []): - resource = hit.get("resource") - if resource: - items.append(resource) - - logger.info(f"Unified search returned {len(items)} items (pre-filter)") - - # Apply our improved folder detection logic - folderItems = [] - for item in items: - resource = item - - # Use the same detection logic as our test - isFolder = self.services.sharepoint.detectFolderType(resource) - - if isFolder: - folderItems.append(item) - - items = folderItems - logger.info(f"Filtered to {len(items)} folders using improved detection logic") - - # Process unified search results - extract site information from webUrl - for item in items: - itemName = item.get("name", "") - webUrl = item.get("webUrl", "") - - # Extract site information from webUrl - siteName = "Unknown Site" - siteId = "unknown" - - if webUrl and '/sites/' in webUrl: - try: - # Extract site name from URL: https://pcuster.sharepoint.com/sites/SiteName/... 
- urlParts = webUrl.split('/sites/') - if len(urlParts) > 1: - sitePath = urlParts[1].split('/')[0] - # Find matching site from discovered sites - # First try to match by site name (URL path) - for site in sites: - if site.get("name") == sitePath: - siteName = site.get("displayName", sitePath) - siteId = site.get("id", "unknown") - break - else: - # If no match by name, try to match by displayName - for site in sites: - if site.get("displayName") == sitePath: - siteName = site.get("displayName", sitePath) - siteId = site.get("id", "unknown") - break - else: - # If no exact match, use the site path as site name - siteName = sitePath - # Try to find a site with similar name - for site in sites: - if sitePath.lower() in site.get("name", "").lower() or sitePath.lower() in site.get("displayName", "").lower(): - siteName = site.get("displayName", sitePath) - siteId = site.get("id", "unknown") - break - except Exception as e: - logger.warning(f"Error extracting site info from URL {webUrl}: {e}") - - # Use improved folder detection logic - isFolder = self.services.sharepoint.detectFolderType(item) - itemType = "folder" if isFolder else "file" - itemPath = item.get("parentReference", {}).get("path", "") - logger.debug(f"Processing {itemType}: '{itemName}' at path: '{itemPath}'") - - # Simple filtering like test file - just check search type - if searchType == "files" and isFolder: - continue # Skip folders when searching for files - elif searchType == "folders" and not isFolder: - continue # Skip files when searching for folders - - # Simple approach like test file - no complex filtering - logger.debug(f"Item '{itemName}' found - adding to results") - - # Create result with full path information for proper action chaining - parentPath = item.get("parentReference", {}).get("path", "") - - # Extract the full SharePoint path from webUrl or parentReference - fullPath = "" - if webUrl: - # Extract path from webUrl: https://pcuster.sharepoint.com/sites/SSSRESYNachfolge/Freigegebene%20Dokumente/General/Eskalation%20LogObject/Druckersteuerung - if '/sites/' in webUrl: - pathPart = webUrl.split('/sites/')[1] - # Decode URL encoding and convert to backslash format - decodedPath = urllib.parse.unquote(pathPart) - fullPath = "\\" + decodedPath.replace('/', '\\') - elif parentPath: - # Use parentReference path if available - fullPath = parentPath.replace('/', '\\') - - docInfo = { - "id": item.get("id"), - "name": item.get("name"), - "type": "folder" if isFolder else "file", - "siteName": siteName, - "siteId": siteId, - "webUrl": webUrl, - "fullPath": fullPath, - "parentPath": parentPath - } - - foundDocuments.append(docInfo) - - logger.info(f"Found {len(foundDocuments)} documents from unified search") - - except Exception as e: - logger.error(f"Error performing unified folder search: {str(e)}") - # Fallback to site-by-site search - pass - - # If no unified search was performed or it failed, fall back to site-by-site search - if not foundDocuments: - # Use simple approach like test file - no complex filtering - siteScopedSites = sites - - for site in siteScopedSites: - siteId = site["id"] - siteName = site["displayName"] - siteUrl = site["webUrl"] - - logger.info(f"Searching in site: {siteName} ({siteUrl})") - - # Check if pathQuery contains a specific folder path (not just /sites/SiteName) - folderPath = None - if pathQuery and pathQuery.startswith('/sites/'): - parsedPath = self._extractSiteFromStandardPath(pathQuery) - if parsedPath: - innerPath = parsedPath.get("innerPath", "") - if innerPath and 
innerPath.strip(): - # Remove leading slash if present - folderPath = innerPath.lstrip('/') - - # Generic approach: Try to find the folder, if it fails, remove first segment - # This works for all languages because we test the actual API response - # In SharePoint Graph API, /drive/root already points to the default document library, - # so library names in paths should be removed - pathSegments = [s for s in folderPath.split('/') if s.strip()] - if len(pathSegments) > 1: - # Try with first segment removed (first segment is likely the document library) - testPath = '/'.join(pathSegments[1:]) - # Quick test: try to get folder info (this is fast and doesn't require full search) - testEndpoint = f"sites/{siteId}/drive/root:/{urllib.parse.quote(testPath, safe='')}:" - testResult = await self._makeGraphApiCall(testEndpoint) - if testResult and "error" not in testResult: - # Path without first segment works - first segment was likely the document library - folderPath = testPath - logger.info(f"Removed document library name '{pathSegments[0]}' from folder path (tested via API)") - else: - # Keep original path - first segment is not a document library - logger.info(f"Keeping original folder path '{folderPath}' (first segment is not a document library)") - elif len(pathSegments) == 1: - # Only one segment - likely the document library itself, use root - folderPath = None - logger.info(f"Only one segment '{pathSegments[0]}' found, likely document library - using root") - - if folderPath: - logger.info(f"Extracted folder path from pathQuery: '{folderPath}'") - else: - logger.info(f"Folder path resolved to root (only document library in path)") - - # Use Microsoft Graph API for this specific site - # Handle empty or wildcard queries - if not fileQuery or fileQuery.strip() == "" or fileQuery.strip() == "*": - # For wildcard/empty queries, list all items - if folderPath: - # List items in specific folder - encodedPath = urllib.parse.quote(folderPath, safe='') - endpoint = f"sites/{siteId}/drive/root:/{encodedPath}:/children" - logger.info(f"Listing items in folder: '{folderPath}'") - else: - # List all items in the drive root - endpoint = f"sites/{siteId}/drive/root/children" - - # Make the API call to list items - listResult = await self._makeGraphApiCall(endpoint) - if "error" in listResult: - logger.warning(f"List failed for site {siteName}: {listResult['error']}") - continue - # Process list results for this site - items = listResult.get("value", []) - logger.info(f"Retrieved {len(items)} items from site {siteName}") - else: - # For files, use regular search API - # Clean the query: remove path-like syntax and invalid KQL syntax - searchQuery = self._cleanSearchQuery(fileQuery) - # URL-encode the query parameter - encodedQuery = urllib.parse.quote(searchQuery, safe='') - - if folderPath: - # Search in specific folder - encodedPath = urllib.parse.quote(folderPath, safe='') - endpoint = f"sites/{siteId}/drive/root:/{encodedPath}:/search(q='{encodedQuery}')" - logger.info(f"Searching in folder '{folderPath}' with query: '{searchQuery}' (encoded: '{encodedQuery}')") - else: - # Search in drive root - endpoint = f"sites/{siteId}/drive/root/search(q='{encodedQuery}')" - logger.info(f"Using search API for files with query: '{searchQuery}' (encoded: '{encodedQuery}')") - - # Make the search API call (files) - searchResult = await self._makeGraphApiCall(endpoint) - if "error" in searchResult: - logger.warning(f"Search failed for site {siteName}: {searchResult['error']}") - continue - # Process search results 
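# A minimal sketch (editorial, outside the diff): the document-library probe above,
# factored out. `graph_get` is a placeholder for the Graph call helper; the heuristic
# is the one in the code: if the path also resolves without its first segment, that
# segment was the library name and /drive/root already covers it.
from urllib.parse import quote

async def strip_library_segment(graph_get, site_id: str, folder_path: str) -> str | None:
    segments = [s for s in folder_path.split("/") if s.strip()]
    if len(segments) <= 1:
        return None  # only the library itself -> list from the drive root
    candidate = "/".join(segments[1:])
    probe = await graph_get(f"sites/{site_id}/drive/root:/{quote(candidate, safe='')}:")
    return candidate if probe and "error" not in probe else folder_path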
for this site (files) - items = searchResult.get("value", []) - logger.info(f"Retrieved {len(items)} items from site {siteName}") - - siteDocuments = [] - - for item in items: - itemName = item.get("name", "") - - # Use improved folder detection logic - isFolder = self.services.sharepoint.detectFolderType(item) - - itemType = "folder" if isFolder else "file" - itemPath = item.get("parentReference", {}).get("path", "") - logger.debug(f"Processing {itemType}: '{itemName}' at path: '{itemPath}'") - - # Simple filtering like test file - just check search type - if searchType == "files" and isFolder: - continue # Skip folders when searching for files - elif searchType == "folders" and not isFolder: - continue # Skip files when searching for folders - - # Simple approach like test file - no complex filtering - logger.debug(f"Item '{itemName}' found - adding to results") - - # Create result with full path information for proper action chaining - webUrl = item.get("webUrl", "") - parentPath = item.get("parentReference", {}).get("path", "") - - # Extract the full SharePoint path from webUrl or parentReference - fullPath = "" - if webUrl: - # Extract path from webUrl: https://pcuster.sharepoint.com/sites/SSSRESYNachfolge/Freigegebene%20Dokumente/General/Eskalation%20LogObject/Druckersteuerung - if '/sites/' in webUrl: - pathPart = webUrl.split('/sites/')[1] - # Decode URL encoding and convert to backslash format - decodedPath = urllib.parse.unquote(pathPart) - fullPath = "\\" + decodedPath.replace('/', '\\') - elif parentPath: - # Use parentReference path if available - fullPath = parentPath.replace('/', '\\') - - docInfo = { - "id": item.get("id"), - "name": item.get("name"), - "type": "folder" if isFolder else "file", - "siteName": siteName, - "siteId": siteId, - "webUrl": webUrl, - "fullPath": fullPath, - "parentPath": parentPath - } - - siteDocuments.append(docInfo) - - foundDocuments.extend(siteDocuments) - allSitesSearched.append({ - "siteName": siteName, - "siteUrl": siteUrl, - "siteId": siteId, - "documentsFound": len(siteDocuments) - }) - - logger.info(f"Found {len(siteDocuments)} documents in site {siteName}") - - # Limit total results to maxResults - if len(foundDocuments) > maxResults: - foundDocuments = foundDocuments[:maxResults] - logger.info(f"Limited results to {maxResults} items") - - self.services.chat.progressLogUpdate(operationId, 0.9, f"Found {len(foundDocuments)} document(s)") - - resultData = { - "searchQuery": searchQuery, - "totalResults": len(foundDocuments), - "maxResults": maxResults, - "foundDocuments": foundDocuments, - "timestamp": self.services.utils.timestampGetUtc() - } - - except Exception as e: - logger.error(f"Error searching SharePoint: {str(e)}") - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error=str(e)) - - # Use default JSON format for output - outputExtension = ".json" # Default - outputMimeType = "application/json" # Default - - validationMetadata = { - "actionType": "sharepoint.findDocumentPath", - "searchQuery": searchQuery, - "maxResults": maxResults, - "totalResults": len(foundDocuments), - "hasResults": len(foundDocuments) > 0 - } - - self.services.chat.progressLogFinish(operationId, True) - return ActionResult( - success=True, - documents=[ - ActionDocument( - documentName=f"sharepoint_find_path_{self._format_timestamp_for_filename()}{outputExtension}", - documentData=json.dumps(resultData, indent=2), - mimeType=outputMimeType, - validationMetadata=validationMetadata - ) - ] - ) - - 
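# A minimal sketch (editorial, outside the diff): the webUrl -> backslash-path
# conversion used twice above, as one pure function with an illustrative name.
from urllib.parse import unquote

def full_path_from_web_url(web_url: str) -> str:
    """'.../sites/Demo/Docs%20A/Sub' -> '\\Demo\\Docs A\\Sub'."""
    if not web_url or "/sites/" not in web_url:
        return ""
    return "\\" + unquote(web_url.split("/sites/")[1]).replace("/", "\\")

assert full_path_from_web_url("https://x.sharepoint.com/sites/Demo/Docs%20A/Sub") == "\\Demo\\Docs A\\Sub"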
except Exception as e: - logger.error(f"Error finding document path: {str(e)}") - if operationId: - try: - self.services.chat.progressLogFinish(operationId, False) - except: - pass - return ActionResult.isFailure(error=str(e)) - - @action - async def readDocuments(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Read documents from SharePoint and extract content/metadata. - - Input requirements: connectionReference (required); documentList or pathQuery (required); includeMetadata (optional). - - Output format: Standardized ActionDocument format (documentName, documentData, mimeType). - - Binary files (PDFs, etc.) are Base64-encoded in documentData. - - Text files are stored as plain text in documentData. - - Returns ActionResult with documents list for template processing. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - documentList (list, optional): Document list reference(s) containing findDocumentPath result. - - pathQuery (str, optional): Direct path query if no documentList (e.g., /sites/SiteName/FolderPath). - - includeMetadata (bool, optional): Include metadata. Default: True. - - Returns: - - ActionResult with documents: List[ActionDocument] where each ActionDocument contains: - - documentName: File name - - documentData: Base64-encoded content (binary files) or plain text (text files) - - mimeType: MIME type (e.g., application/pdf, text/plain) - """ - import time - operationId = None - try: - # Init progress logger - workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}" - operationId = f"sharepoint_read_{workflowId}_{int(time.time())}" - - # Start progress tracking - parentOperationId = parameters.get('parentOperationId') - self.services.chat.progressLogStart( - operationId, - "Read Documents", - "SharePoint Document Reading", - "Processing document list", - parentOperationId=parentOperationId - ) - - documentList = parameters.get("documentList") - pathQuery = parameters.get("pathQuery", "*") - connectionReference = parameters.get("connectionReference") - includeMetadata = parameters.get("includeMetadata", True) - - # Validate connection reference - if not connectionReference: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="Connection reference is required") - - # Require either documentList or pathQuery - if not documentList and (not pathQuery or pathQuery.strip() == "" or pathQuery.strip() == "*"): - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="Either documentList or pathQuery is required") - - # Get connection first - self.services.chat.progressLogUpdate(operationId, 0.2, "Getting Microsoft connection") - connection = self._getMicrosoftConnection(connectionReference) - if not connection: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="No valid Microsoft connection found for the provided connection reference") - - # Parse documentList to extract foundDocuments and site information - sharePointFileIds = None - sites = None - - if documentList: - foundDocuments, sites, errorMsg = await self._parseDocumentListForFoundDocuments(documentList) - if errorMsg: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error=errorMsg) - - if foundDocuments: - # Extract SharePoint file IDs from foundDocuments - 
sharePointFileIds = [doc.get("id") for doc in foundDocuments if doc.get("type") == "file"] - if not sharePointFileIds: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="No files found in documentList from findDocumentPath result") - logger.info(f"Extracted {len(sharePointFileIds)} SharePoint file IDs from documentList") - - # If we have SharePoint file IDs from documentList (findDocumentPath result), read them directly - if sharePointFileIds and sites: - # Read SharePoint files directly using their IDs - readResults = [] - siteId = sites[0]['id'] - - self.services.chat.progressLogUpdate(operationId, 0.5, f"Reading {len(sharePointFileIds)} file(s) from SharePoint") - for idx, fileId in enumerate(sharePointFileIds): - try: - self.services.chat.progressLogUpdate(operationId, 0.5 + (idx * 0.3 / len(sharePointFileIds)), f"Reading file {idx + 1}/{len(sharePointFileIds)}") - # Get file info from SharePoint - endpoint = f"sites/{siteId}/drive/items/{fileId}" - fileInfo = await self._makeGraphApiCall(endpoint) - - if "error" in fileInfo: - logger.warning(f"Failed to get file info for {fileId}: {fileInfo['error']}") - continue - - # Get file content using SharePoint service (handles binary data correctly) - fileName = fileInfo.get("name", f"file_{fileId}") - fileContent = await self.services.sharepoint.downloadFile(siteId, fileId) - - # Create result document - resultItem = { - "fileId": fileId, - "fileName": fileName, - "sharepointFileId": fileId, - "siteName": sites[0]['displayName'], - "siteUrl": sites[0]['webUrl'], - "size": fileInfo.get("size", 0), - "createdDateTime": fileInfo.get("createdDateTime"), - "lastModifiedDateTime": fileInfo.get("lastModifiedDateTime"), - "webUrl": fileInfo.get("webUrl") - } - - # Add content if available - if fileContent: - resultItem["content"] = fileContent - - # Add metadata if requested - if includeMetadata: - resultItem["metadata"] = { - "mimeType": fileInfo.get("file", {}).get("mimeType"), - "downloadUrl": fileInfo.get("@microsoft.graph.downloadUrl"), - "createdBy": fileInfo.get("createdBy", {}), - "lastModifiedBy": fileInfo.get("lastModifiedBy", {}), - "parentReference": fileInfo.get("parentReference", {}) - } - - readResults.append(resultItem) - except Exception as e: - logger.error(f"Error reading file {fileId}: {str(e)}") - continue - - if not readResults: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="No files could be read from documentList") - - # Convert read results to ActionDocument objects - # IMPORTANT: For binary files (PDFs), store Base64-encoded content directly in documentData - # The system will create FileData and ChatDocument automatically - self.services.chat.progressLogUpdate(operationId, 0.8, f"Processing {len(readResults)} document(s)") - from modules.datamodels.datamodelChat import ActionDocument - import base64 - - actionDocuments = [] - for resultItem in readResults: - fileContent = resultItem.get("content") - fileName = resultItem.get("fileName", f"file_{resultItem.get('fileId')}") - - # Determine MIME type from metadata or file extension - mimeType = "application/octet-stream" - if resultItem.get("metadata", {}).get("mimeType"): - mimeType = resultItem["metadata"]["mimeType"] - elif fileName: - if fileName.endswith('.pdf'): - mimeType = "application/pdf" - elif fileName.endswith('.txt'): - mimeType = "text/plain" - elif fileName.endswith('.json'): - mimeType = "application/json" - - # For binary files (PDFs, etc.), 
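# A minimal sketch (editorial, outside the diff): the documentData convention used
# below. Binary payloads travel as Base64 strings and round-trip losslessly; the
# hand-rolled extension table above could also defer to the standard mimetypes module.
import base64
import mimetypes

pdf_bytes = b"%PDF-1.7 stand-in content"
document_data = base64.b64encode(pdf_bytes).decode("utf-8")
assert base64.b64decode(document_data) == pdf_bytes
assert mimetypes.guess_type("report.pdf")[0] == "application/pdf"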
store Base64-encoded content directly - # The GenerationService will detect PDF mimeType and handle base64 decoding - if fileContent and isinstance(fileContent, bytes): - # Encode binary content as Base64 string - base64Content = base64.b64encode(fileContent).decode('utf-8') - validationMetadata = { - "actionType": "sharepoint.readDocuments", - "fileName": fileName, - "sharepointFileId": resultItem.get("sharepointFileId"), - "siteName": resultItem.get("siteName"), - "mimeType": mimeType, - "contentType": "binary", - "size": len(fileContent), - "includeMetadata": includeMetadata - } - actionDoc = ActionDocument( - documentName=fileName, - documentData=base64Content, # Base64 string for binary files - mimeType=mimeType, - validationMetadata=validationMetadata - ) - actionDocuments.append(actionDoc) - logger.info(f"Stored binary file {fileName} ({len(fileContent)} bytes) as Base64 in ActionDocument") - elif fileContent: - # Text content - store directly in documentData - validationMetadata = { - "actionType": "sharepoint.readDocuments", - "fileName": fileName, - "sharepointFileId": resultItem.get("sharepointFileId"), - "siteName": resultItem.get("siteName"), - "mimeType": mimeType, - "contentType": "text", - "includeMetadata": includeMetadata - } - actionDoc = ActionDocument( - documentName=fileName, - documentData=fileContent if isinstance(fileContent, str) else str(fileContent), - mimeType=mimeType, - validationMetadata=validationMetadata - ) - actionDocuments.append(actionDoc) - else: - # No content - store metadata only - docData = { - "fileName": fileName, - "sharepointFileId": resultItem.get("sharepointFileId"), - "siteName": resultItem.get("siteName"), - "siteUrl": resultItem.get("siteUrl"), - "size": resultItem.get("size"), - "createdDateTime": resultItem.get("createdDateTime"), - "lastModifiedDateTime": resultItem.get("lastModifiedDateTime"), - "webUrl": resultItem.get("webUrl") - } - if resultItem.get("metadata"): - docData["metadata"] = resultItem["metadata"] - - validationMetadata = { - "actionType": "sharepoint.readDocuments", - "fileName": fileName, - "sharepointFileId": resultItem.get("sharepointFileId"), - "siteName": resultItem.get("siteName"), - "mimeType": mimeType, - "contentType": "metadata_only", - "includeMetadata": includeMetadata - } - actionDoc = ActionDocument( - documentName=fileName, - documentData=json.dumps(docData, indent=2), - mimeType=mimeType, - validationMetadata=validationMetadata - ) - actionDocuments.append(actionDoc) - - # Return success with action documents - self.services.chat.progressLogUpdate(operationId, 0.9, f"Read {len(actionDocuments)} document(s)") - self.services.chat.progressLogFinish(operationId, True) - return ActionResult.isSuccess(documents=actionDocuments) - - # If no sites from documentList, try pathQuery fallback - if not sites and pathQuery and pathQuery.strip() != "" and pathQuery.strip() != "*": - sites, errorMsg = await self._resolveSitesFromPathQuery(pathQuery) - if errorMsg: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error=errorMsg) - - # If still no sites, return error - if not sites: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="Either documentList must contain findDocumentPath result with file information, or pathQuery must be provided. 
Use findDocumentPath first to get file paths, or provide pathQuery directly.")
-
-            # This should never be reached if the logic above is correct
-            if operationId:
-                self.services.chat.progressLogFinish(operationId, False)
-            return ActionResult.isFailure(error="Unexpected error: could not process documentList or pathQuery")
-        except Exception as e:
-            logger.error(f"Error reading SharePoint documents: {str(e)}")
-            if operationId:
-                try:
-                    self.services.chat.progressLogFinish(operationId, False)
-                except:
-                    pass  # Don't fail on progress logging errors
-            return ActionResult(
-                success=False,
-                error=str(e)
-            )
-
-    @action
-    async def uploadDocument(self, parameters: Dict[str, Any]) -> ActionResult:
-        """
-        GENERAL:
-        - Purpose: Upload documents to SharePoint. Only choose this action when a connectionReference is available.
-        - Input requirements: connectionReference (required); documentList (required); pathQuery (optional).
-        - Output format: JSON with upload status and file info.
-
-        Parameters:
-        - connectionReference (str, required): Microsoft connection label.
-        - documentList (list, required): Document reference(s) to upload. File names are taken from the documents.
-        - pathQuery (str, optional): Direct upload target path if documentList doesn't contain findDocumentPath result (e.g., /sites/SiteName/FolderPath).
-        """
-        import time
-        operationId = None
-        try:
-            # Init progress logger
-            workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
-            operationId = f"sharepoint_upload_{workflowId}_{int(time.time())}"
-
-            # Start progress tracking
-            parentOperationId = parameters.get('parentOperationId')
-            self.services.chat.progressLogStart(
-                operationId,
-                "Upload Document",
-                "SharePoint Upload",
-                "Processing document list",
-                parentOperationId=parentOperationId
-            )
-
-            connectionReference = parameters.get("connectionReference")
-            documentList = parameters.get("documentList")
-            pathQuery = parameters.get("pathQuery")
-            if isinstance(documentList, str):
-                documentList = [documentList]
-
-            if not connectionReference:
-                if operationId:
-                    self.services.chat.progressLogFinish(operationId, False)
-                return ActionResult.isFailure(error="Connection reference is required")
-
-            if not documentList:
-                if operationId:
-                    self.services.chat.progressLogFinish(operationId, False)
-                return ActionResult.isFailure(error="Document list is required")
-
-            # Parse documentList to extract folder path and site information
-            uploadPath, sites, filesToUpload, errorMsg = await self._parseDocumentListForFolder(documentList)
-            if errorMsg:
-                if operationId:
-                    self.services.chat.progressLogFinish(operationId, False)
-                return ActionResult.isFailure(error=errorMsg)
-
-            # If no folder path found from documentList, use pathQuery if provided
-            if not uploadPath and pathQuery and pathQuery.strip() != "" and pathQuery.strip() != "*":
-                uploadPath = pathQuery
-                logger.info(f"Using pathQuery for upload path: {uploadPath}")
-                # Resolve sites from pathQuery
-                sites, errorMsg = await self._resolveSitesFromPathQuery(pathQuery)
-                if errorMsg:
-                    if operationId:
-                        self.services.chat.progressLogFinish(operationId, False)
-                    return ActionResult.isFailure(error=errorMsg)
-
-            # Validate required parameters
-            if not uploadPath:
-                if operationId:
-                    self.services.chat.progressLogFinish(operationId, False)
-                return ActionResult.isFailure(error="Either documentList must contain findDocumentPath result with folder information, or pathQuery must be provided.
Use findDocumentPath first to get upload folder, or provide pathQuery directly.")
-
-            if not sites:
-                if operationId:
-                    self.services.chat.progressLogFinish(operationId, False)
-                return ActionResult.isFailure(error="Site information missing. Cannot determine target site for upload.")
-
-            if not filesToUpload:
-                if operationId:
-                    self.services.chat.progressLogFinish(operationId, False)
-                return ActionResult.isFailure(error="No files to upload found in documentList.")
-
-            # Get connection
-            self.services.chat.progressLogUpdate(operationId, 0.3, "Getting Microsoft connection")
-            connection = self._getMicrosoftConnection(connectionReference)
-            if not connection:
-                if operationId:
-                    self.services.chat.progressLogFinish(operationId, False)
-                return ActionResult.isFailure(error="No valid Microsoft connection found for the provided connection reference")
-
-            # Process upload paths
-            uploadPaths = []
-            if uploadPath.startswith('01PPXICCB') or uploadPath.startswith('01'):
-                # It's a folder ID - use it directly
-                uploadPaths = [uploadPath]
-                logger.info(f"Using folder ID directly for upload: {uploadPath}")
-            else:
-                # It's a path - resolve it normally
-                uploadPaths = self._resolvePathQuery(uploadPath)
-
-            # Process each document upload
-            uploadResults = []
-
-            # Extract file names from documents
-            fileNames = [doc.fileName for doc in filesToUpload]
-            logger.info(f"Using file names from documentList: {fileNames}")
-
-            self.services.chat.progressLogUpdate(operationId, 0.5, f"Uploading {len(filesToUpload)} document(s)")
-
-            for i, (chatDocument, fileName) in enumerate(zip(filesToUpload, fileNames)):
-                try:
-                    fileId = chatDocument.fileId
-                    fileData = self.services.chat.getFileData(fileId)
-
-                    if not fileData:
-                        logger.warning(f"File data not found for fileId: {fileId}")
-                        uploadResults.append({
-                            "fileName": fileName,
-                            "fileId": fileId,
-                            "error": "File data not found",
-                            "uploadStatus": "failed"
-                        })
-                        continue
-
-                    # Upload to the first available site (or could be made configurable)
-                    uploadSuccessful = False
-
-                    for site in sites:
-                        siteId = site["id"]
-                        siteName = site["displayName"]
-                        siteUrl = site["webUrl"]
-
-                        # Use the first upload path or default to Documents
-                        uploadPath = uploadPaths[0] if uploadPaths else "/Documents"
-
-                        # Handle wildcard paths - replace with default Documents folder
-                        if uploadPath == "*":
-                            uploadPath = "/Documents"
-                            logger.warning(f"Wildcard path '*' detected, using default '/Documents' folder for upload")
-
-                        # Check if uploadPath is a folder ID or a regular path
-                        if uploadPath.startswith('01PPXICCB') or uploadPath.startswith('01'):
-                            # It's a folder ID - use the folder-specific upload endpoint
-                            uploadEndpoint = f"sites/{siteId}/drive/items/{uploadPath}:/{fileName}:/content"
-                            logger.info(f"Using folder ID upload endpoint: {uploadEndpoint}")
-                        else:
-                            # It's a regular path - use the root-based upload endpoint
-                            uploadPath = uploadPath.rstrip('/') + '/' + fileName
-                            uploadPathClean = uploadPath.lstrip('/')
-                            uploadEndpoint = f"sites/{siteId}/drive/root:/{uploadPathClean}:/content"
-                            logger.info(f"Using path-based upload endpoint: {uploadEndpoint}")
-
-                        # Upload endpoint for small files (< 4MB)
-                        if len(fileData) < 4 * 1024 *
1024: # 4MB - - # Upload the file - uploadResult = await self._makeGraphApiCall( - uploadEndpoint, - method="PUT", - data=fileData - ) - - if "error" not in uploadResult: - uploadResults.append({ - "fileName": fileName, - "fileId": fileId, - "uploadStatus": "success", - "siteName": siteName, - "siteUrl": siteUrl, - "uploadPath": uploadPath, - "uploadEndpoint": uploadEndpoint, - "sharepointFileId": uploadResult.get("id"), - "webUrl": uploadResult.get("webUrl"), - "size": uploadResult.get("size"), - "createdDateTime": uploadResult.get("createdDateTime") - }) - uploadSuccessful = True - break - else: - logger.warning(f"Upload failed to site {siteName}: {uploadResult['error']}") - else: - # For large files, we would need to implement resumable upload - logger.warning(f"File too large ({len(fileData)} bytes) for site {siteName}") - continue - - if not uploadSuccessful: - uploadResults.append({ - "fileName": fileName, - "fileId": fileId, - "error": f"File too large ({len(fileData)} bytes) or upload failed to all sites. Files larger than 4MB require resumable upload (not implemented).", - "uploadStatus": "failed" - }) - - except Exception as e: - logger.error(f"Error uploading document {fileName}: {str(e)}") - uploadResults.append({ - "fileName": fileName, - "fileId": fileId, - "error": str(e), - "uploadStatus": "failed" - }) - - # Update progress for each file - self.services.chat.progressLogUpdate(operationId, 0.5 + (i * 0.4 / len(filesToUpload)), f"Uploaded {i + 1}/{len(filesToUpload)} file(s)") - - # Create result data - resultData = { - "connectionReference": connectionReference, - "uploadPath": uploadPath, - "documentList": documentList, - "fileNames": fileNames, - "sitesAvailable": len(sites), - "uploadResults": uploadResults, - "connection": { - "id": connection["id"], - "authority": "microsoft", - "reference": connectionReference - }, - "timestamp": self.services.utils.timestampGetUtc() - } - - # Use default JSON format for output - outputExtension = ".json" # Default - outputMimeType = "application/json" # Default - - validationMetadata = { - "actionType": "sharepoint.uploadDocument", - "connectionReference": connectionReference, - "uploadPath": uploadPath, - "fileNames": fileNames, - "uploadCount": len(uploadResults), - "successfulUploads": len([r for r in uploadResults if r.get("uploadStatus") == "success"]), - "failedUploads": len([r for r in uploadResults if r.get("uploadStatus") == "failed"]) - } - - successfulUploads = len([r for r in uploadResults if r.get("uploadStatus") == "success"]) - self.services.chat.progressLogUpdate(operationId, 0.9, f"Uploaded {successfulUploads}/{len(uploadResults)} file(s)") - self.services.chat.progressLogFinish(operationId, successfulUploads > 0) - - return ActionResult( - success=True, - documents=[ - ActionDocument( - documentName=f"sharepoint_upload_{self._format_timestamp_for_filename()}{outputExtension}", - documentData=json.dumps(resultData, indent=2), - mimeType=outputMimeType, - validationMetadata=validationMetadata - ) - ] - ) - - except Exception as e: - logger.error(f"Error uploading to SharePoint: {str(e)}") - if operationId: - try: - self.services.chat.progressLogFinish(operationId, False) - except: - pass - return ActionResult( - success=False, - error=str(e) - ) - - @action - async def listDocuments(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: List documents and folders in SharePoint paths across sites. 
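# A minimal sketch (editorial, outside the diff): the 4 MB branch above in one place.
# Graph's simple PUT .../content upload is for small files; larger payloads need an
# upload session (driveItem createUploadSession plus chunked PUTs), which this module
# left unimplemented - hence the "failed" result for big files.
SMALL_FILE_LIMIT = 4 * 1024 * 1024  # 4 MB

def choose_upload_strategy(size: int) -> str:
    return "simple-put" if size < SMALL_FILE_LIMIT else "upload-session"

assert choose_upload_strategy(1024) == "simple-put"
assert choose_upload_strategy(10 * 1024 * 1024) == "upload-session"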
- - Input requirements: connectionReference (required); documentList or pathQuery (required); includeSubfolders (optional). - - Output format: JSON with folder items and metadata. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - documentList (list, optional): Document list reference(s) containing findDocumentPath result. - - pathQuery (str, optional): Direct path query if no documentList (e.g., /sites/SiteName/FolderPath). - - includeSubfolders (bool, optional): Include one level of subfolders. Default: False. - """ - import time - operationId = None - try: - # Init progress logger - workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}" - operationId = f"sharepoint_list_{workflowId}_{int(time.time())}" - - # Start progress tracking - parentOperationId = parameters.get('parentOperationId') - self.services.chat.progressLogStart( - operationId, - "List Documents", - "SharePoint Listing", - "Processing document list", - parentOperationId=parentOperationId - ) - - connectionReference = parameters.get("connectionReference") - documentList = parameters.get("documentList") - pathQuery = parameters.get("pathQuery", "*") - if isinstance(documentList, str): - documentList = [documentList] - includeSubfolders = parameters.get("includeSubfolders", False) # Default to False for better UX - - if not connectionReference: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="Connection reference is required") - - # Require either documentList or pathQuery - if not documentList and (not pathQuery or pathQuery.strip() == "" or pathQuery.strip() == "*"): - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="Either documentList or pathQuery is required") - - # Parse documentList to extract folder path and site information - listQuery, sites, _, errorMsg = await self._parseDocumentListForFolder(documentList) - if errorMsg: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error=errorMsg) - - # If no folder path found from documentList, use pathQuery if provided - if not listQuery and pathQuery and pathQuery.strip() != "" and pathQuery.strip() != "*": - listQuery = pathQuery - logger.info(f"Using pathQuery for list query: {listQuery}") - # Resolve sites from pathQuery - sites, errorMsg = await self._resolveSitesFromPathQuery(pathQuery) - if errorMsg: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error=errorMsg) - - # Validate required parameters - if not listQuery: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="Either documentList must contain findDocumentPath result with folder information, or pathQuery must be provided. Use findDocumentPath first to get folder path, or provide pathQuery directly.") - - if not sites: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="Site information missing.
Cannot determine target site for list operation.") - - # Get connection - self.services.chat.progressLogUpdate(operationId, 0.2, "Getting Microsoft connection") - connection = self._getMicrosoftConnection(connectionReference) - if not connection: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="No valid Microsoft connection found for the provided connection reference") - - logger.info(f"Starting SharePoint listDocuments for listQuery: {listQuery}") - logger.debug(f"Connection ID: {connection['id']}") - - self.services.chat.progressLogUpdate(operationId, 0.3, "Processing folder path") - - # Parse listQuery to extract path, search terms, search type, and options - pathQuery, fileQuery, searchType, searchOptions = self._parseSearchQuery(listQuery) - - # Check if listQuery is a folder ID (starts with 01PPXICCB...) - if listQuery.startswith('01PPXICCB') or listQuery.startswith('01'): - # Direct folder ID - use it directly - folderPaths = [listQuery] - logger.info(f"Using direct folder ID: {listQuery}") - else: - # Remove site prefix from pathQuery before resolving (it's only for site filtering) - pathQueryForResolve = pathQuery - # Microsoft-standard path: /sites/SiteName/Path -> /Path - if pathQuery.startswith('/sites/'): - parsedPath = self._extractSiteFromStandardPath(pathQuery) - if parsedPath: - innerPath = parsedPath.get("innerPath", "") - pathQueryForResolve = '/' + innerPath if innerPath else '/' - else: - pathQueryForResolve = '/' - - # Remove first path segment if it looks like a document library name - # In SharePoint Graph API, /drive/root already points to the default document library, - # so library names in paths should be removed - # Generic approach: if path has multiple segments, store original for fallback - pathSegments = [s for s in pathQueryForResolve.split('/') if s.strip()] - if len(pathSegments) > 1: - # Path has multiple segments - first might be a library name - # Store original for potential fallback - originalPath = pathQueryForResolve - # Try without first segment (assuming it's a library name) - pathQueryForResolve = '/' + '/'.join(pathSegments[1:]) - logger.info(f"Removed first path segment (potential library name), path changed from '{originalPath}' to '{pathQueryForResolve}'") - elif len(pathSegments) == 1: - # Only one segment - if it's a common library-like name, use root - firstSegmentLower = pathSegments[0].lower() - libraryIndicators = ['document', 'dokument', 'shared', 'freigegeben', 'library', 'bibliothek'] - if any(indicator in firstSegmentLower for indicator in libraryIndicators): - pathQueryForResolve = '/' - logger.info(f"First segment '{pathSegments[0]}' appears to be a library name, using root") - - # Resolve path query into folder paths - folderPaths = self._resolvePathQuery(pathQueryForResolve) - logger.info(f"Resolved folder paths: {folderPaths}") - - # Process each folder path across all sites - listResults = [] - - self.services.chat.progressLogUpdate(operationId, 0.5, f"Listing {len(folderPaths)} folder(s) across {len(sites)} site(s)") - - for folderPath in folderPaths: - try: - folderResults = [] - - for site in sites: - siteId = site["id"] - siteName = site["displayName"] - siteUrl = site["webUrl"] - - logger.info(f"Listing folder {folderPath} in site: {siteName}") - - # Determine the endpoint based on folder path - if folderPath in ["/", ""] or folderPath == "*": - # Root folder - endpoint = f"sites/{siteId}/drive/root/children" - elif folderPath.startswith('01PPXICCB') or 
folderPath.startswith('01'): - # Direct folder ID - endpoint = f"sites/{siteId}/drive/items/{folderPath}/children" - else: - # Specific folder path - remove leading slash if present and URL encode - folderPathClean = folderPath.lstrip('/') - # URL encode the path for Graph API (spaces and special characters need encoding) - folderPathEncoded = urllib.parse.quote(folderPathClean, safe='/') - endpoint = f"sites/{siteId}/drive/root:/{folderPathEncoded}:/children" - - # Make the API call to list folder contents - apiResult = await self._makeGraphApiCall(endpoint) - - if "error" in apiResult: - logger.warning(f"Failed to list folder {folderPath} in site {siteName}: {apiResult['error']}") - continue - - # Process the results - items = apiResult.get("value", []) - processedItems = [] - - for item in items: - # Use improved folder detection logic - isFolder = self.services.sharepoint.detectFolderType(item) - - itemInfo = { - "id": item.get("id"), - "name": item.get("name"), - "size": item.get("size", 0), - "createdDateTime": item.get("createdDateTime"), - "lastModifiedDateTime": item.get("lastModifiedDateTime"), - "webUrl": item.get("webUrl"), - "type": "folder" if isFolder else "file", - "siteName": siteName, - "siteUrl": siteUrl - } - - # Add file-specific information - if "file" in item: - itemInfo.update({ - "mimeType": item["file"].get("mimeType"), - "downloadUrl": item.get("@microsoft.graph.downloadUrl") - }) - - # Add folder-specific information - if "folder" in item: - itemInfo.update({ - "childCount": item["folder"].get("childCount", 0) - }) - - processedItems.append(itemInfo) - - # If include subfolders is enabled, get ONLY direct subfolder contents (1 level deep only) - if includeSubfolders: - folderItems = [item for item in processedItems if item['type'] == 'folder'] - logger.info(f"Including subfolders - processing {len(folderItems)} folders") - subfolderCount = 0 - maxSubfolders = 10 # Limit to prevent infinite loops - - for item in processedItems[:]: # Use slice to avoid modifying list during iteration - if item["type"] == "folder" and subfolderCount < maxSubfolders: - subfolderCount += 1 - subfolderPath = f"{folderPath.rstrip('/')}/{item['name']}" - subfolderEndpoint = f"sites/{siteId}/drive/items/{item['id']}/children" - - logger.debug(f"Getting contents of subfolder: {item['name']}") - subfolderResult = await self._makeGraphApiCall(subfolderEndpoint) - if "error" not in subfolderResult: - subfolderItems = subfolderResult.get("value", []) - logger.debug(f"Found {len(subfolderItems)} items in subfolder {item['name']}") - - for subfolderItem in subfolderItems: - # Use improved folder detection logic for subfolder items - subfolderIsFolder = self.services.sharepoint.detectFolderType(subfolderItem) - - # Only add files and direct subfolders, NO RECURSION - subfolderItemInfo = { - "id": subfolderItem.get("id"), - "name": subfolderItem.get("name"), - "size": subfolderItem.get("size", 0), - "createdDateTime": subfolderItem.get("createdDateTime"), - "lastModifiedDateTime": subfolderItem.get("lastModifiedDateTime"), - "webUrl": subfolderItem.get("webUrl"), - "type": "folder" if subfolderIsFolder else "file", - "parentPath": subfolderPath, - "siteName": siteName, - "siteUrl": siteUrl - } - - if "file" in subfolderItem: - subfolderItemInfo.update({ - "mimeType": subfolderItem["file"].get("mimeType"), - "downloadUrl": subfolderItem.get("@microsoft.graph.downloadUrl") - }) - - processedItems.append(subfolderItemInfo) - else: - logger.warning(f"Failed to get contents of subfolder 
{item['name']}: {subfolderResult.get('error')}") - elif subfolderCount >= maxSubfolders: - logger.warning(f"Reached maximum subfolder limit ({maxSubfolders}), skipping remaining folders") - break - - logger.info(f"Processed {subfolderCount} subfolders, total items: {len(processedItems)}") - - folderResults.append({ - "siteName": siteName, - "siteUrl": siteUrl, - "itemCount": len(processedItems), - "items": processedItems - }) - - listResults.append({ - "folderPath": folderPath, - "sitesProcessed": len(folderResults), - "siteResults": folderResults - }) - - except Exception as e: - logger.error(f"Error listing folder {folderPath}: {str(e)}") - listResults.append({ - "folderPath": folderPath, - "error": str(e), - "siteResults": [] - }) - - totalItems = sum(len(result.get("siteResults", [])) for result in listResults) - self.services.chat.progressLogUpdate(operationId, 0.9, f"Found {totalItems} item(s)") - - # Create result data - resultData = { - "pathQuery": listQuery, - "includeSubfolders": includeSubfolders, - "sitesSearched": len(sites), - "listResults": listResults, - "timestamp": self.services.utils.timestampGetUtc() - } - - # Use default JSON format for output - outputExtension = ".json" # Default - outputMimeType = "application/json" # Default - - validationMetadata = { - "actionType": "sharepoint.listDocuments", - "pathQuery": listQuery, - "includeSubfolders": includeSubfolders, - "sitesSearched": len(sites), - "folderCount": len(listResults), - "totalItems": totalItems - } - - self.services.chat.progressLogFinish(operationId, True) - return ActionResult( - success=True, - documents=[ - ActionDocument( - documentName=f"sharepoint_document_list_{self._format_timestamp_for_filename()}{outputExtension}", - documentData=json.dumps(resultData, indent=2), - mimeType=outputMimeType, - validationMetadata=validationMetadata - ) - ] - ) - - except Exception as e: - logger.error(f"Error listing SharePoint documents: {str(e)}") - if operationId: - try: - self.services.chat.progressLogFinish(operationId, False) - except: - pass - return ActionResult( - success=False, - error=str(e) - ) - - @action - async def analyzeFolderUsage(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Analyze usage intensity of folders and files in SharePoint. - - Input requirements: connectionReference (required); documentList (required); optional startDateTime, endDateTime, interval. - - Output format: JSON with usage analytics grouped by time intervals. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - documentList (list, required): Document list reference(s) containing findDocumentPath result. - - startDateTime (str, optional): Start date/time in ISO format (e.g., "2025-11-01T00:00:00Z"). Default: 30 days ago. - - endDateTime (str, optional): End date/time in ISO format (e.g., "2025-11-30T23:59:59Z"). Default: current time. - - interval (str, optional): Time interval for grouping activities. Options: "day", "week", "month". Default: "day". 
- """ - import time - operationId = None - try: - # Init progress logger - workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}" - operationId = f"sharepoint_usage_{workflowId}_{int(time.time())}" - - # Start progress tracking - parentOperationId = parameters.get('parentOperationId') - self.services.chat.progressLogStart( - operationId, - "Analyze Folder Usage", - "SharePoint Analytics", - "Processing document list", - parentOperationId=parentOperationId - ) - - connectionReference = parameters.get("connectionReference") - documentList = parameters.get("documentList") - pathQuery = parameters.get("pathQuery") - if isinstance(documentList, str): - documentList = [documentList] - startDateTime = parameters.get("startDateTime") - endDateTime = parameters.get("endDateTime") - interval = parameters.get("interval", "day") - - if not connectionReference: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="Connection reference is required") - - # Require either documentList or pathQuery - if not documentList and (not pathQuery or pathQuery.strip() == "" or pathQuery.strip() == "*"): - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="Either documentList or pathQuery is required") - - # Resolve folder/item information from documentList or pathQuery - siteId = None - driveId = None - itemId = None - folderPath = None - folderName = None - - if documentList: - foundDocuments, sites, errorMsg = await self._parseDocumentListForFoundDocuments(documentList) - if errorMsg: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error=errorMsg) - - if not foundDocuments: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="No documents found in documentList") - - # Get siteId from first document (all should be from same site) - firstItem = foundDocuments[0] - siteId = firstItem.get("siteId") - if not siteId: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="Site ID missing from documentList") - - # Get drive ID (needed for analytics) - driveId = await self.services.sharepoint.getDriveId(siteId) - if not driveId: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="Could not determine drive ID for the site") - - # If no items from documentList, try pathQuery fallback - if not foundDocuments and pathQuery and pathQuery.strip() != "" and pathQuery.strip() != "*": - sites, errorMsg = await self._resolveSitesFromPathQuery(pathQuery) - if errorMsg: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error=errorMsg) - - if sites: - siteId = sites[0].get("id") - # Parse pathQuery to find the folder/item - pathQueryParsed, fileQuery, searchType, searchOptions = self._parseSearchQuery(pathQuery) - - # Extract folder path from pathQuery - folderPath = '/' - if pathQueryParsed and pathQueryParsed.startswith('/sites/'): - parsedPath = self._extractSiteFromStandardPath(pathQueryParsed) - if parsedPath: - innerPath = parsedPath.get("innerPath", "") - folderPath = '/' + innerPath if innerPath else '/' - elif pathQueryParsed: - folderPath = pathQueryParsed - - # Get drive ID - driveId = await self.services.sharepoint.getDriveId(siteId) - if not 
driveId: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="Could not determine drive ID for the site") - - # Get folder/item by path - folderInfo = await self.services.sharepoint.getFolderByPath(siteId, folderPath.lstrip('/')) - if not folderInfo: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error=f"Folder or file not found at path: {folderPath}") - - # Add pathQuery item to foundDocuments for processing - foundDocuments = [{ - "id": folderInfo.get("id"), - "name": folderInfo.get("name", ""), - "type": "folder" if folderInfo.get("folder") else "file", - "siteId": siteId, - "fullPath": folderPath, - "webUrl": folderInfo.get("webUrl", "") - }] - - if not siteId or not driveId: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="Either documentList must contain findDocumentPath result with folder information, or pathQuery must be provided. Use findDocumentPath first to get folder path, or provide pathQuery directly.") - - self.services.chat.progressLogUpdate(operationId, 0.2, "Getting Microsoft connection") - # Get Microsoft connection - connection = self._getMicrosoftConnection(connectionReference) - if not connection: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="No valid Microsoft connection found for the provided connection reference") - - # Set access token - if not self.services.sharepoint.setAccessTokenFromConnection(connection): - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="Failed to set SharePoint access token") - - # Process all items from documentList or pathQuery - # IMPORTANT: Only analyze FOLDERS, not files (action is "analyzeFolderUsage") - itemsToAnalyze = [] - if foundDocuments: - for item in foundDocuments: - itemId = item.get("id") - itemType = item.get("type", "").lower() - - # Only process folders, skip files and site-level items - if itemId and itemType == "folder": - itemsToAnalyze.append({ - "id": itemId, - "name": item.get("name", ""), - "type": itemType, - "path": item.get("fullPath", ""), - "webUrl": item.get("webUrl", "") - }) - - if not itemsToAnalyze: - if operationId: - self.services.chat.progressLogFinish(operationId, False) - return ActionResult.isFailure(error="No valid folders found in documentList to analyze. 
Note: This action only analyzes folders, not files.") - - self.services.chat.progressLogUpdate(operationId, 0.4, f"Analyzing {len(itemsToAnalyze)} folder(s)") - - # Analyze each item - allAnalytics = [] - totalActivities = 0 - uniqueUsers = set() - activityTypes = {} - - # Compute actual date range values (getFolderUsageAnalytics will set defaults if None) - # We need to compute them here to store in output, since getFolderUsageAnalytics modifies them - actualStartDateTime = startDateTime - actualEndDateTime = endDateTime - if not actualEndDateTime: - actualEndDateTime = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z') - if not actualStartDateTime: - startDate = datetime.now(timezone.utc) - timedelta(days=30) - actualStartDateTime = startDate.isoformat().replace('+00:00', 'Z') - - for idx, item in enumerate(itemsToAnalyze): - progress = 0.4 + (idx / len(itemsToAnalyze)) * 0.5 - self.services.chat.progressLogUpdate(operationId, progress, f"Analyzing folder {item['name']} ({idx+1}/{len(itemsToAnalyze)})") - - # Get usage analytics for this folder - analyticsResult = await self.services.sharepoint.getFolderUsageAnalytics( - siteId=siteId, - driveId=driveId, - itemId=item["id"], - startDateTime=startDateTime, - endDateTime=endDateTime, - interval=interval - ) - - if "error" in analyticsResult: - logger.warning(f"Failed to get analytics for item {item['name']} ({item['id']}): {analyticsResult['error']}") - # Continue with other items even if one fails - itemAnalytics = { - "itemId": item["id"], - "itemName": item["name"], - "itemType": item["type"], - "itemPath": item["path"], - "error": analyticsResult.get("error", "Unknown error") - } - else: - # Process analytics for this item - itemActivities = 0 - itemUsers = set() - itemActivityTypes = {} - - if "value" in analyticsResult: - for intervalData in analyticsResult["value"]: - activities = intervalData.get("activities", []) - for activity in activities: - itemActivities += 1 - totalActivities += 1 - - action = activity.get("action", {}) - actionType = action.get("verb", "unknown") - itemActivityTypes[actionType] = itemActivityTypes.get(actionType, 0) + 1 - activityTypes[actionType] = activityTypes.get(actionType, 0) + 1 - - actor = activity.get("actor", {}) - userPrincipalName = actor.get("userPrincipalName", "") - if userPrincipalName: - itemUsers.add(userPrincipalName) - uniqueUsers.add(userPrincipalName) - - itemAnalytics = { - "itemId": item["id"], - "itemName": item["name"], - "itemType": item["type"], - "itemPath": item["path"], - "webUrl": item["webUrl"], - "analytics": analyticsResult, - "summary": { - "totalActivities": itemActivities, - "uniqueUsers": len(itemUsers), - "activityTypes": itemActivityTypes - } - } - - # Include note if analytics are not available - if "note" in analyticsResult: - itemAnalytics["note"] = analyticsResult["note"] - - allAnalytics.append(itemAnalytics) - - self.services.chat.progressLogUpdate(operationId, 0.9, "Processing analytics data") - - # Process and format analytics data - resultData = { - "siteId": siteId, - "driveId": driveId, - "startDateTime": actualStartDateTime, # Store computed date range (not None) - "endDateTime": actualEndDateTime, # Store computed date range (not None) - "interval": interval, - "itemsAnalyzed": len(itemsToAnalyze), - "foldersAnalyzed": len([item for item in allAnalytics if item.get("itemType") == "folder"]), - "items": allAnalytics, - "summary": { - "totalActivities": totalActivities, - "uniqueUsers": len(uniqueUsers), - "activityTypes": activityTypes - }, - 
"note": f"Analyzed {len(itemsToAnalyze)} folder(s) from {actualStartDateTime} to {actualEndDateTime}. " + - f"Found {totalActivities} total activities across {len(uniqueUsers)} unique user(s)." + - (f" Note: {len([item for item in allAnalytics if 'error' in item])} folder(s) had errors or no analytics data available." if any('error' in item for item in allAnalytics) else ""), - "timestamp": self.services.utils.timestampGetUtc() - } - - self.services.chat.progressLogUpdate(operationId, 0.95, f"Found {totalActivities} total activities across {len(itemsToAnalyze)} folder(s)") - - validationMetadata = { - "actionType": "sharepoint.analyzeFolderUsage", - "itemsAnalyzed": len(itemsToAnalyze), - "interval": interval, - "totalActivities": totalActivities, - "uniqueUsers": len(uniqueUsers) - } - - self.services.chat.progressLogFinish(operationId, True) - return ActionResult( - success=True, - documents=[ - ActionDocument( - documentName=f"sharepoint_usage_analysis_{self._format_timestamp_for_filename()}.json", - documentData=json.dumps(resultData, indent=2), - mimeType="application/json", - validationMetadata=validationMetadata - ) - ] - ) - - except Exception as e: - logger.error(f"Error analyzing folder usage: {str(e)}") - if operationId: - try: - self.services.chat.progressLogFinish(operationId, False) - except: - pass - return ActionResult( - success=False, - error=str(e) - ) - - @action - async def findSiteByUrl(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Find SharePoint site by hostname and site path. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - hostname (str, required): SharePoint hostname (e.g., "example.sharepoint.com") - - sitePath (str, required): Site path (e.g., "SteeringBPM" or "/sites/SteeringBPM") - - Returns: - - ActionResult with ActionDocument containing site information (id, displayName, name, webUrl) - """ - try: - connectionReference = parameters.get("connectionReference") - if not connectionReference: - return ActionResult.isFailure(error="connectionReference parameter is required") - - hostname = parameters.get("hostname") - if not hostname: - return ActionResult.isFailure(error="hostname parameter is required") - - sitePath = parameters.get("sitePath") - if not sitePath: - return ActionResult.isFailure(error="sitePath parameter is required") - - # Get Microsoft connection - connection = self._getMicrosoftConnection(connectionReference) - if not connection: - return ActionResult.isFailure(error="No valid Microsoft connection found for the provided connection reference") - - # Find site by URL - siteInfo = await self.services.sharepoint.findSiteByUrl( - hostname=hostname, - sitePath=sitePath - ) - - if not siteInfo: - return ActionResult.isFailure(error=f"Site not found: {hostname}:/sites/{sitePath}") - - logger.info(f"Found SharePoint site: {siteInfo.get('displayName')} (ID: {siteInfo.get('id')})") - - # Generate filename - workflowContext = self.services.chat.getWorkflowContext() if hasattr(self.services, 'chat') else None - filename = self._generateMeaningfulFileName( - "sharepoint_site", - "json", - workflowContext, - "findSiteByUrl" - ) - - validationMetadata = self._createValidationMetadata( - "findSiteByUrl", - hostname=hostname, - sitePath=sitePath, - siteId=siteInfo.get("id") - ) - - document = ActionDocument( - documentName=filename, - documentData=json.dumps(siteInfo, indent=2), - mimeType="application/json", - validationMetadata=validationMetadata - ) - - return ActionResult.isSuccess(documents=[document]) 
- - except Exception as e: - errorMsg = f"Error finding SharePoint site: {str(e)}" - logger.error(errorMsg) - return ActionResult.isFailure(error=errorMsg) - - @action - async def downloadFileByPath(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Download file from SharePoint by exact file path. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - siteId (str, required): SharePoint site ID (from findSiteByUrl result) or document reference containing site info - - filePath (str, required): Full file path relative to site root (e.g., "/General/50 Docs hosted by SELISE/file.xlsx") - - Returns: - - ActionResult with ActionDocument containing file content as base64-encoded bytes - """ - try: - connectionReference = parameters.get("connectionReference") - if not connectionReference: - return ActionResult.isFailure(error="connectionReference parameter is required") - - siteIdParam = parameters.get("siteId") - if not siteIdParam: - return ActionResult.isFailure(error="siteId parameter is required") - - filePath = parameters.get("filePath") - if not filePath: - return ActionResult.isFailure(error="filePath parameter is required") - - # Extract siteId from document if it's a reference - siteId = None - if isinstance(siteIdParam, str): - # Try to parse from document reference - from modules.datamodels.datamodelDocref import DocumentReferenceList - try: - docList = DocumentReferenceList.from_string_list([siteIdParam]) - chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(docList) - if chatDocuments and len(chatDocuments) > 0: - siteInfoJson = json.loads(chatDocuments[0].documentData) - siteId = siteInfoJson.get("id") - except: - pass - - if not siteId: - # Assume it's the site ID directly - siteId = siteIdParam - else: - siteId = siteIdParam - - if not siteId: - return ActionResult.isFailure(error="Could not extract siteId from parameter") - - # Get Microsoft connection - connection = self._getMicrosoftConnection(connectionReference) - if not connection: - return ActionResult.isFailure(error="No valid Microsoft connection found for the provided connection reference") - - # Download file - fileContent = await self.services.sharepoint.downloadFileByPath( - siteId=siteId, - filePath=filePath - ) - - if fileContent is None: - return ActionResult.isFailure(error=f"File not found or could not be downloaded: {filePath}") - - logger.info(f"Downloaded file from SharePoint: {filePath} ({len(fileContent)} bytes)") - - # Generate filename from filePath - import os - fileName = os.path.basename(filePath) or "downloaded_file" - workflowContext = self.services.chat.getWorkflowContext() if hasattr(self.services, 'chat') else None - filename = self._generateMeaningfulFileName( - fileName.split('.')[0] if '.' in fileName else fileName, - fileName.split('.')[-1] if '.' 
in fileName else "bin", - workflowContext, - "downloadFileByPath" - ) - - # Encode as base64 - import base64 - fileBase64 = base64.b64encode(fileContent).decode('utf-8') - - validationMetadata = self._createValidationMetadata( - "downloadFileByPath", - siteId=siteId, - filePath=filePath, - fileSize=len(fileContent) - ) - - document = ActionDocument( - documentName=filename, - documentData=fileBase64, - mimeType="application/octet-stream", - validationMetadata=validationMetadata - ) - - return ActionResult.isSuccess(documents=[document]) - - except Exception as e: - errorMsg = f"Error downloading file from SharePoint: {str(e)}" - logger.error(errorMsg) - return ActionResult.isFailure(error=errorMsg) - - @action - async def copyFile(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Copy file within SharePoint. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - siteId (str, required): SharePoint site ID (from findSiteByUrl result) or document reference containing site info - - sourceFolder (str, required): Source folder path relative to site root - - sourceFile (str, required): Source file name - - destFolder (str, required): Destination folder path relative to site root - - destFile (str, required): Destination file name - - Returns: - - ActionResult with ActionDocument containing copy result - """ - try: - connectionReference = parameters.get("connectionReference") - if not connectionReference: - return ActionResult.isFailure(error="connectionReference parameter is required") - - siteIdParam = parameters.get("siteId") - if not siteIdParam: - return ActionResult.isFailure(error="siteId parameter is required") - - sourceFolder = parameters.get("sourceFolder") - if not sourceFolder: - return ActionResult.isFailure(error="sourceFolder parameter is required") - - sourceFile = parameters.get("sourceFile") - if not sourceFile: - return ActionResult.isFailure(error="sourceFile parameter is required") - - destFolder = parameters.get("destFolder") - if not destFolder: - return ActionResult.isFailure(error="destFolder parameter is required") - - destFile = parameters.get("destFile") - if not destFile: - return ActionResult.isFailure(error="destFile parameter is required") - - # Extract siteId from document if it's a reference - siteId = None - if isinstance(siteIdParam, str): - from modules.datamodels.datamodelDocref import DocumentReferenceList - try: - docList = DocumentReferenceList.from_string_list([siteIdParam]) - chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(docList) - if chatDocuments and len(chatDocuments) > 0: - siteInfoJson = json.loads(chatDocuments[0].documentData) - siteId = siteInfoJson.get("id") - except: - pass - - if not siteId: - siteId = siteIdParam - else: - siteId = siteIdParam - - if not siteId: - return ActionResult.isFailure(error="Could not extract siteId from parameter") - - # Get Microsoft connection - connection = self._getMicrosoftConnection(connectionReference) - if not connection: - return ActionResult.isFailure(error="No valid Microsoft connection found for the provided connection reference") - - # Copy file - await self.services.sharepoint.copyFileAsync( - siteId=siteId, - sourceFolder=sourceFolder, - sourceFile=sourceFile, - destFolder=destFolder, - destFile=destFile - ) - - logger.info(f"Copied file in SharePoint: {sourceFolder}/{sourceFile} -> {destFolder}/{destFile}") - - # Generate filename - workflowContext = self.services.chat.getWorkflowContext() if hasattr(self.services, 'chat') else None - 
filename = self._generateMeaningfulFileName( - "file_copy_result", - "json", - workflowContext, - "copyFile" - ) - - result = { - "success": True, - "siteId": siteId, - "sourcePath": f"{sourceFolder}/{sourceFile}", - "destPath": f"{destFolder}/{destFile}" - } - - validationMetadata = self._createValidationMetadata( - "copyFile", - siteId=siteId, - sourcePath=f"{sourceFolder}/{sourceFile}", - destPath=f"{destFolder}/{destFile}" - ) - - document = ActionDocument( - documentName=filename, - documentData=json.dumps(result, indent=2), - mimeType="application/json", - validationMetadata=validationMetadata - ) - - return ActionResult.isSuccess(documents=[document]) - - except Exception as e: - # Handle file not found gracefully - if "itemNotFound" in str(e) or "404" in str(e): - logger.warning(f"File not found for copy: {parameters.get('sourceFolder')}/{parameters.get('sourceFile')}") - # Return success with skipped status - workflowContext = self.services.chat.getWorkflowContext() if hasattr(self.services, 'chat') else None - filename = self._generateMeaningfulFileName( - "file_copy_result", - "json", - workflowContext, - "copyFile" - ) - - result = { - "success": True, - "skipped": True, - "reason": "File not found (may not exist yet)" - } - - validationMetadata = self._createValidationMetadata( - "copyFile", - skipped=True - ) - - document = ActionDocument( - documentName=filename, - documentData=json.dumps(result, indent=2), - mimeType="application/json", - validationMetadata=validationMetadata - ) - - return ActionResult.isSuccess(documents=[document]) - - errorMsg = f"Error copying file in SharePoint: {str(e)}" - logger.error(errorMsg) - return ActionResult.isFailure(error=errorMsg) - - @action - async def uploadFile(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Upload raw file content (bytes) to SharePoint. - - Parameters: - - connectionReference (str, required): Microsoft connection label. 
- - siteId (str, required): SharePoint site ID (from findSiteByUrl result) or document reference containing site info - - folderPath (str, required): Folder path relative to site root - - fileName (str, required): File name - - content (str, required): Document reference containing file content as base64-encoded bytes - - Returns: - - ActionResult with ActionDocument containing upload result - """ - try: - connectionReference = parameters.get("connectionReference") - if not connectionReference: - return ActionResult.isFailure(error="connectionReference parameter is required") - - siteIdParam = parameters.get("siteId") - if not siteIdParam: - return ActionResult.isFailure(error="siteId parameter is required") - - folderPath = parameters.get("folderPath") - if not folderPath: - return ActionResult.isFailure(error="folderPath parameter is required") - - fileName = parameters.get("fileName") - if not fileName: - return ActionResult.isFailure(error="fileName parameter is required") - - contentParam = parameters.get("content") - if not contentParam: - return ActionResult.isFailure(error="content parameter is required") - - # Extract siteId from document if it's a reference - siteId = None - if isinstance(siteIdParam, str): - from modules.datamodels.datamodelDocref import DocumentReferenceList - try: - docList = DocumentReferenceList.from_string_list([siteIdParam]) - chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(docList) - if chatDocuments and len(chatDocuments) > 0: - siteInfoJson = json.loads(chatDocuments[0].documentData) - siteId = siteInfoJson.get("id") - except: - pass - - if not siteId: - siteId = siteIdParam - else: - siteId = siteIdParam - - if not siteId: - return ActionResult.isFailure(error="Could not extract siteId from parameter") - - # Get file content from document - from modules.datamodels.datamodelDocref import DocumentReferenceList - docList = DocumentReferenceList.from_string_list([contentParam] if isinstance(contentParam, str) else contentParam) - chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(docList) - if not chatDocuments or len(chatDocuments) == 0: - return ActionResult.isFailure(error="Could not get file content from document reference") - - fileContentBase64 = chatDocuments[0].documentData - - # Decode base64 - import base64 - try: - fileContent = base64.b64decode(fileContentBase64) - except Exception as e: - return ActionResult.isFailure(error=f"Could not decode base64 file content: {str(e)}") - - # Get Microsoft connection - connection = self._getMicrosoftConnection(connectionReference) - if not connection: - return ActionResult.isFailure(error="No valid Microsoft connection found for the provided connection reference") - - # Upload file - uploadResult = await self.services.sharepoint.uploadFile( - siteId=siteId, - folderPath=folderPath, - fileName=fileName, - content=fileContent - ) - - if "error" in uploadResult: - return ActionResult.isFailure(error=f"Upload failed: {uploadResult['error']}") - - logger.info(f"Uploaded file to SharePoint: {folderPath}/{fileName} ({len(fileContent)} bytes)") - - # Generate filename - workflowContext = self.services.chat.getWorkflowContext() if hasattr(self.services, 'chat') else None - filename = self._generateMeaningfulFileName( - "file_upload_result", - "json", - workflowContext, - "uploadFile" - ) - - result = { - "success": True, - "siteId": siteId, - "filePath": f"{folderPath}/{fileName}", - "fileSize": len(fileContent), - "uploadResult": uploadResult - } - - validationMetadata = 
self._createValidationMetadata( - "uploadFile", - siteId=siteId, - filePath=f"{folderPath}/{fileName}", - fileSize=len(fileContent) - ) - - document = ActionDocument( - documentName=filename, - documentData=json.dumps(result, indent=2), - mimeType="application/json", - validationMetadata=validationMetadata - ) - - return ActionResult.isSuccess(documents=[document]) - - except Exception as e: - errorMsg = f"Error uploading file to SharePoint: {str(e)}" - logger.error(errorMsg) - return ActionResult.isFailure(error=errorMsg) \ No newline at end of file diff --git a/modules/workflows/methods/methodSharepoint/actions/analyzeFolderUsage.py b/modules/workflows/methods/methodSharepoint/actions/analyzeFolderUsage.py index 075c8b96..a4bf18b6 100644 --- a/modules/workflows/methods/methodSharepoint/actions/analyzeFolderUsage.py +++ b/modules/workflows/methods/methodSharepoint/actions/analyzeFolderUsage.py @@ -1,36 +1,16 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Analyze Folder Usage action for SharePoint operations. -Analyzes usage intensity of folders and files in SharePoint. -""" - import logging import time import json from datetime import datetime, timezone, timedelta from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def analyzeFolderUsage(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Analyze usage intensity of folders and files in SharePoint. - - Input requirements: connectionReference (required); documentList or pathQuery (required); optional startDateTime, endDateTime, interval. - - Output format: JSON with usage analytics grouped by time intervals. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - documentList (list, optional): Document list reference(s) containing findDocumentPath result. - - pathQuery (str, optional): Direct path query if no documentList (e.g., /sites/SiteName/FolderPath). - - startDateTime (str, optional): Start date/time in ISO format (e.g., "2025-11-01T00:00:00Z"). Default: 30 days ago. - - endDateTime (str, optional): End date/time in ISO format (e.g., "2025-11-30T23:59:59Z"). Default: current time. - - interval (str, optional): Time interval for grouping activities. Options: "day", "week", "month". Default: "day". - """ operationId = None try: # Init progress logger diff --git a/modules/workflows/methods/methodSharepoint/actions/copyFile.py b/modules/workflows/methods/methodSharepoint/actions/copyFile.py index 1b6d821d..f149e482 100644 --- a/modules/workflows/methods/methodSharepoint/actions/copyFile.py +++ b/modules/workflows/methods/methodSharepoint/actions/copyFile.py @@ -1,35 +1,14 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Copy File action for SharePoint operations. -Copies file within SharePoint. -""" - import logging import json from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def copyFile(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Copy file within SharePoint. - - Parameters: - - connectionReference (str, required): Microsoft connection label.
- - siteId (str, required): SharePoint site ID (from findSiteByUrl result) or document reference containing site info - - sourceFolder (str, required): Source folder path relative to site root - - sourceFile (str, required): Source file name - - destFolder (str, required): Destination folder path relative to site root - - destFile (str, required): Destination file name - - Returns: - - ActionResult with ActionDocument containing copy result - """ try: connectionReference = parameters.get("connectionReference") if not connectionReference: diff --git a/modules/workflows/methods/methodSharepoint/actions/downloadFileByPath.py b/modules/workflows/methods/methodSharepoint/actions/downloadFileByPath.py index d6e291a8..c64a6637 100644 --- a/modules/workflows/methods/methodSharepoint/actions/downloadFileByPath.py +++ b/modules/workflows/methods/methodSharepoint/actions/downloadFileByPath.py @@ -1,34 +1,16 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Download File By Path action for SharePoint operations. -Downloads file from SharePoint by exact file path. -""" - import logging import json import base64 import os from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def downloadFileByPath(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Download file from SharePoint by exact file path. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - siteId (str, required): SharePoint site ID (from findSiteByUrl result) or document reference containing site info - - filePath (str, required): Full file path relative to site root (e.g., "/General/50 Docs hosted by SELISE/file.xlsx") - - Returns: - - ActionResult with ActionDocument containing file content as base64-encoded bytes - """ try: connectionReference = parameters.get("connectionReference") if not connectionReference: diff --git a/modules/workflows/methods/methodSharepoint/actions/findDocumentPath.py b/modules/workflows/methods/methodSharepoint/actions/findDocumentPath.py index 01c1baf3..722dbc99 100644 --- a/modules/workflows/methods/methodSharepoint/actions/findDocumentPath.py +++ b/modules/workflows/methods/methodSharepoint/actions/findDocumentPath.py @@ -1,35 +1,16 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Find Document Path action for SharePoint operations. -Finds documents and folders by name/path across SharePoint sites. -""" - import logging import time import json import urllib.parse from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def findDocumentPath(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Find documents and folders by name/path across sites. - - Input requirements: connectionReference (required); searchQuery (required); optional site, maxResults. - - Output format: JSON with found items and paths. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - site (str, optional): Site hint. - - searchQuery (str, required): Search terms or path. - - maxResults (int, optional): Maximum items to return. Default: 1000. 
- """ operationId = None try: # Init progress logger diff --git a/modules/workflows/methods/methodSharepoint/actions/findSiteByUrl.py b/modules/workflows/methods/methodSharepoint/actions/findSiteByUrl.py index 405b35f2..62b6dd94 100644 --- a/modules/workflows/methods/methodSharepoint/actions/findSiteByUrl.py +++ b/modules/workflows/methods/methodSharepoint/actions/findSiteByUrl.py @@ -1,32 +1,14 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Find Site By URL action for SharePoint operations. -Finds SharePoint site by hostname and site path. -""" - import logging import json from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def findSiteByUrl(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Find SharePoint site by hostname and site path. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - hostname (str, required): SharePoint hostname (e.g., "example.sharepoint.com") - - sitePath (str, required): Site path (e.g., "SteeringBPM" or "/sites/SteeringBPM") - - Returns: - - ActionResult with ActionDocument containing site information (id, displayName, name, webUrl) - """ try: connectionReference = parameters.get("connectionReference") if not connectionReference: diff --git a/modules/workflows/methods/methodSharepoint/actions/listDocuments.py b/modules/workflows/methods/methodSharepoint/actions/listDocuments.py index 78aabadc..318271c3 100644 --- a/modules/workflows/methods/methodSharepoint/actions/listDocuments.py +++ b/modules/workflows/methods/methodSharepoint/actions/listDocuments.py @@ -1,34 +1,16 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -List Documents action for SharePoint operations. -Lists documents and folders in SharePoint paths across sites. -""" - import logging import time import json import urllib.parse from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def listDocuments(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: List documents and folders in SharePoint paths across sites. - - Input requirements: connectionReference (required); documentList (required); includeSubfolders (optional). - - Output format: JSON with folder items and metadata. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - documentList (list, required): Document list reference(s) containing findDocumentPath result. - - includeSubfolders (bool, optional): Include one level of subfolders. Default: False. - """ operationId = None try: # Init progress logger diff --git a/modules/workflows/methods/methodSharepoint/actions/readDocuments.py b/modules/workflows/methods/methodSharepoint/actions/readDocuments.py index 2bc2688c..73cdb730 100644 --- a/modules/workflows/methods/methodSharepoint/actions/readDocuments.py +++ b/modules/workflows/methods/methodSharepoint/actions/readDocuments.py @@ -1,44 +1,16 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Read Documents action for SharePoint operations. -Reads documents from SharePoint and extracts content/metadata. 
-""" - import logging import time import json import base64 from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def readDocuments(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Read documents from SharePoint and extract content/metadata. - - Input requirements: connectionReference (required); documentList or pathQuery (required); includeMetadata (optional). - - Output format: Standardized ActionDocument format (documentName, documentData, mimeType). - - Binary files (PDFs, etc.) are Base64-encoded in documentData. - - Text files are stored as plain text in documentData. - - Returns ActionResult with documents list for template processing. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - documentList (list, optional): Document list reference(s) containing findDocumentPath result. - - pathQuery (str, optional): Direct path query if no documentList (e.g., /sites/SiteName/FolderPath). - - includeMetadata (bool, optional): Include metadata. Default: True. - - Returns: - - ActionResult with documents: List[ActionDocument] where each ActionDocument contains: - - documentName: File name - - documentData: Base64-encoded content (binary files) or plain text (text files) - - mimeType: MIME type (e.g., application/pdf, text/plain) - """ operationId = None try: # Init progress logger diff --git a/modules/workflows/methods/methodSharepoint/actions/uploadDocument.py b/modules/workflows/methods/methodSharepoint/actions/uploadDocument.py index 82c93434..cfe4cf86 100644 --- a/modules/workflows/methods/methodSharepoint/actions/uploadDocument.py +++ b/modules/workflows/methods/methodSharepoint/actions/uploadDocument.py @@ -1,34 +1,16 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Upload Document action for SharePoint operations. -Uploads documents to SharePoint. -""" - import logging import time import json import urllib.parse from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def uploadDocument(self, parameters: Dict[str, Any]) -> ActionResult: - """ - GENERAL: - - Purpose: Upload documents to SharePoint. Only to choose this action with a connectionReference - - Input requirements: connectionReference (required); documentList (required); pathQuery (optional). - - Output format: JSON with upload status and file info. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - documentList (list, required): Document reference(s) to upload. File names are taken from the documents. - - pathQuery (str, optional): Direct upload target path if documentList doesn't contain findDocumentPath result (e.g., /sites/SiteName/FolderPath). - """ operationId = None try: # Init progress logger diff --git a/modules/workflows/methods/methodSharepoint/actions/uploadFile.py b/modules/workflows/methods/methodSharepoint/actions/uploadFile.py index 3d8a9499..1f469b80 100644 --- a/modules/workflows/methods/methodSharepoint/actions/uploadFile.py +++ b/modules/workflows/methods/methodSharepoint/actions/uploadFile.py @@ -1,35 +1,15 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -""" -Upload File action for SharePoint operations. -Uploads raw file content (bytes) to SharePoint. 
-""" - import logging import json import base64 from typing import Dict, Any -from modules.workflows.methods.methodBase import action from modules.datamodels.datamodelChat import ActionResult, ActionDocument logger = logging.getLogger(__name__) -@action async def uploadFile(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Upload raw file content (bytes) to SharePoint. - - Parameters: - - connectionReference (str, required): Microsoft connection label. - - siteId (str, required): SharePoint site ID (from findSiteByUrl result) or document reference containing site info - - folderPath (str, required): Folder path relative to site root - - fileName (str, required): File name - - content (str, required): Document reference containing file content as base64-encoded bytes - - Returns: - - ActionResult with ActionDocument containing upload result - """ try: connectionReference = parameters.get("connectionReference") if not connectionReference: diff --git a/modules/workflows/processing/adaptive/contentValidator.py b/modules/workflows/processing/adaptive/contentValidator.py index 36673ed0..fe0ee5bd 100644 --- a/modules/workflows/processing/adaptive/contentValidator.py +++ b/modules/workflows/processing/adaptive/contentValidator.py @@ -24,7 +24,7 @@ class ContentValidator: self.services = services self.learningEngine = learningEngine - async def validateContent(self, documents: List[Any], intent: Dict[str, Any], taskStep: Optional[Any] = None, actionName: Optional[str] = None, actionParameters: Optional[Dict[str, Any]] = None, actionHistory: Optional[List[Dict[str, Any]]] = None) -> Dict[str, Any]: + async def validateContent(self, documents: List[Any], intent: Dict[str, Any], taskStep: Optional[Any] = None, actionName: Optional[str] = None, actionParameters: Optional[Dict[str, Any]] = None, actionHistory: Optional[List[Dict[str, Any]]] = None, context: Optional[Any] = None) -> Dict[str, Any]: """Validates delivered content against user intent using AI (single attempt; parse-or-fail) Args: @@ -34,8 +34,9 @@ class ContentValidator: actionName: Optional action name (e.g., "ai.process", "ai.webResearch") that created the documents actionParameters: Optional action parameters used during execution (e.g., {"columnsPerRow": 10, "researchDepth": "deep"}) actionHistory: Optional list of previously executed actions in the workflow (for multi-step workflow context) + context: Optional context object to access all documents delivered in the current round """ - return await self._validateWithAI(documents, intent, taskStep, actionName, actionParameters, actionHistory) + return await self._validateWithAI(documents, intent, taskStep, actionName, actionParameters, actionHistory, context) def _summarizeJsonStructure(self, jsonData: Any) -> Dict[str, Any]: """Summarize JSON document structure for validation - extracts main objects, statistics, captions, and IDs.""" @@ -533,7 +534,7 @@ class ContentValidator: return False - async def _validateWithAI(self, documents: List[Any], intent: Dict[str, Any], taskStep: Optional[Any] = None, actionName: Optional[str] = None, actionParameters: Optional[Dict[str, Any]] = None, actionHistory: Optional[List[Dict[str, Any]]] = None) -> Dict[str, Any]: + async def _validateWithAI(self, documents: List[Any], intent: Dict[str, Any], taskStep: Optional[Any] = None, actionName: Optional[str] = None, actionParameters: Optional[Dict[str, Any]] = None, actionHistory: Optional[List[Dict[str, Any]]] = None, context: Optional[Any] = None) -> Dict[str, Any]: """AI-based comprehensive 
validation - generic approach""" try: if not hasattr(self, 'services') or not self.services or not hasattr(self.services, 'ai'): @@ -636,9 +637,46 @@ class ContentValidator: actionHistoryContext = f"\n\n=== ACTION HISTORY ===\n" + "\n".join(f"- {entry}" for entry in historyEntries) actionHistoryContext += "\n\nIMPORTANT: This shows the complete workflow that produced the documents. For process-oriented criteria (e.g., 'internet search performed'), check ACTION HISTORY first. Document metadata may only reflect the LAST action, not the entire workflow." + # Build document index context (all documents delivered in current round) + documentIndexContext = "" + if context and self.services and hasattr(self.services, 'chat') and hasattr(self.services, 'workflow') and self.services.workflow: + try: + documentIndex = self.services.chat.getAvailableDocuments(self.services.workflow) + if documentIndex and documentIndex.strip() and documentIndex != "No documents available": + # Extract only "Current round documents" section if present + lines = documentIndex.split('\n') + currentRoundSection = [] + inCurrentRound = False + for line in lines: + if "Current round documents:" in line: + inCurrentRound = True + currentRoundSection.append(line) + elif inCurrentRound: + # Note: line.strip() removes leading whitespace, so a single prefix check per marker suffices + if line.strip().startswith("- docList:") or line.strip().startswith("- docItem:"): + currentRoundSection.append(line) + elif line.strip() == "": + # Empty line is okay, continue + continue + elif "Past rounds documents:" in line or "AVAILABLE_CONNECTIONS_INDEX:" in line: + # End of current round section + break + else: + # Still in current round section + currentRoundSection.append(line) + + if currentRoundSection: + documentIndexContext = "\n\n=== ALL DOCUMENTS DELIVERED IN CURRENT ROUND ===\n" + "\n".join(currentRoundSection) + documentIndexContext += "\n\nIMPORTANT: This shows ALL documents that have been delivered in the current round, not just the ones being validated in this step. Use this to check if all required formats/documents are present across the entire round." + except Exception as e: + logger.warning(f"Error extracting document index for validation: {str(e)}") + # Continue without document index - not critical + + # Transform criteria that require data access into metadata-only checks + transformedCriteria = self._transformCriteriaForMetadataOnly(successCriteria) + # Format success criteria for display with index numbers - if successCriteria: - criteriaDisplay = "\n".join([f"[{i}] {criterion}" for i, criterion in enumerate(successCriteria)]) + if transformedCriteria: + criteriaDisplay = "\n".join([f"[{i}] {criterion}" for i, criterion in enumerate(transformedCriteria)]) else: criteriaDisplay = "[]" @@ -647,7 +685,7 @@ class ContentValidator: === TASK INFORMATION === {objectiveLabel}: '{objectiveText}' EXPECTED DATA TYPE: {dataType} -EXPECTED FORMATS: {expectedFormats if expectedFormats else ['any']}{actionContext}{actionParamsContext}{validationMetadataContext}{actionHistoryContext} +EXPECTED FORMATS: {expectedFormats if expectedFormats else ['any']}{actionContext}{actionParamsContext}{validationMetadataContext}{actionHistoryContext}{documentIndexContext} === VALIDATION INSTRUCTIONS === @@ -661,6 +699,7 @@ VALIDATION RULES: 5. PROCESS VALIDATION: Use ACTION HISTORY for process-oriented criteria (e.g., "search performed", "extraction done"). 6. ONE CRITERION PER EVALUATION: Evaluate each criterion independently. Do not mention other criteria. 7.
@@ -661,6 +699,7 @@ VALIDATION RULES:
 5. PROCESS VALIDATION: Use ACTION HISTORY for process-oriented criteria (e.g., "search performed", "extraction done").
 6. ONE CRITERION PER EVALUATION: Evaluate each criterion independently. Do not mention other criteria.
 7. NO ASSUMPTIONS: Do NOT assume content was AI-generated vs extracted. If a section exists with content_type, the content was delivered. Only validate what is present in the metadata.
+8. DATA-LEVEL CRITERIA TRANSFORMATION: Criteria mentioning accuracy percentages (e.g., "95% accuracy"), completeness percentages (e.g., "98% completeness"), or "all X extracted" have been transformed to metadata-only checks. For accuracy/completeness: Check if contentPartIds reference all source documents and if structure metadata shows expected data types (tables, lists, etc.) exist. For "all X extracted": Check if contentPartIds reference all source documents mentioned in ACTION HISTORY or document index. NEVER attempt to verify accuracy/completeness by comparing actual data values - only use metadata indicators.
 
 VALIDATION STEPS:
 - Check ACTION HISTORY for process-oriented criteria
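The transformation promised by rule 8 is implemented by `_transformCriteriaForMetadataOnly` in the hunk that follows: a matching criterion keeps its original wording but is wrapped in a metadata-only instruction. The before/after shape for an invented criterion, as the accuracy branch would produce it (subject resolved to "transactions"):

original = "95% accuracy for extracted transactions"
transformed = (
    "[METADATA ONLY] " + original + ": Check that contentPartIds reference all source "
    "documents and jsonStructure shows expected transactions structure exists "
    "(tables/lists with rowCount/itemCount > 0). Cannot verify actual transactions "
    "accuracy values from metadata."
)
print(transformed)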
@@ -812,6 +851,52 @@ DELIVERED DOCUMENTS ({len(documents)} items):
             logger.error(f"AI validation failed: {str(e)}")
             raise
 
+    def _transformCriteriaForMetadataOnly(self, criteria: List[str]) -> List[str]:
+        """
+        Transform criteria that require data access into metadata-only checks.
+
+        Preserves the original criterion intent while converting data-level checks to metadata checks.
+        Examples:
+        - "95% accuracy" → "[METADATA ONLY] Data structure indicates extraction completed (check contentPartIds reference all source documents)"
+        - "98% completeness" → "[METADATA ONLY] All source documents referenced in contentPartIds (verify source count matches)"
+        - "all transactions extracted" → "[METADATA ONLY] All source documents referenced in contentPartIds (verify source count matches)"
+        """
+        if not criteria:
+            return []
+
+        transformed = []
+        for criterion in criteria:
+            original = criterion.strip()
+            transformedCriterion = original
+
+            # Pattern: accuracy percentage (e.g., "95% accuracy", "accuracy meets or exceeds 95% threshold")
+            if re.search(r'\d+%?\s*accuracy|accuracy.*\d+%', original, re.IGNORECASE):
+                # Extract the main subject (e.g., "transactions", "data", etc.)
+                subjectMatch = re.search(r'(transactions?|data|items?|records?|entries?)', original, re.IGNORECASE)
+                subject = subjectMatch.group(1).lower() if subjectMatch else "data"
+
+                transformedCriterion = f"[METADATA ONLY] {original}: Check that contentPartIds reference all source documents and jsonStructure shows expected {subject} structure exists (tables/lists with rowCount/itemCount > 0). Cannot verify actual {subject} accuracy values from metadata."
+
+            # Pattern: completeness percentage or "all X extracted" (e.g., "98% completeness", "all transactions extracted")
+            elif re.search(r'\d+%?\s*completeness|completeness.*\d+%|all\s+.*extracted|extract.*all', original, re.IGNORECASE):
+                # Extract the main subject
+                subjectMatch = re.search(r'(transactions?|data|items?|records?|entries?|statements?|documents?)', original, re.IGNORECASE)
+                subject = subjectMatch.group(1).lower() if subjectMatch else "items"
+
+                transformedCriterion = f"[METADATA ONLY] {original}: Verify that contentPartIds reference all source documents mentioned in ACTION HISTORY/document index, and jsonStructure shows {subject} structure exists (check rowCount/itemCount in tables/lists). Cannot verify actual {subject} count from metadata."
+
+            # Pattern: "no missing data" or "no incorrect data"
+            elif re.search(r'no\s+missing|no\s+incorrect|no\s+errors?', original, re.IGNORECASE):
+                transformedCriterion = f"[METADATA ONLY] {original}: Check that jsonStructure.content_type shows expected data types present (tables, lists, etc.) and contentPreview.looksLikeRenderedContent=true. Cannot verify actual data values from metadata."
+
+            # Pattern: data accuracy without percentage (e.g., "data is accurate", "accurate data")
+            elif re.search(r'data.*accurate|accurate.*data', original, re.IGNORECASE) and '%' not in original:
+                transformedCriterion = f"[METADATA ONLY] {original}: Check that contentPartIds reference source documents and jsonStructure shows expected data structure exists. Cannot verify actual data accuracy values from metadata."
+
+            transformed.append(transformedCriterion)
+
+        return transformed
+
     def _createFailedValidationResult(self, errorMessage: str) -> Dict[str, Any]:
         """Create a standardized failed validation result"""
         return {
diff --git a/modules/workflows/processing/modes/modeDynamic.py b/modules/workflows/processing/modes/modeDynamic.py
index 50889b22..92e04e96 100644
--- a/modules/workflows/processing/modes/modeDynamic.py
+++ b/modules/workflows/processing/modes/modeDynamic.py
@@ -158,7 +158,7 @@ class DynamicMode(BaseMode):
             actionName = selection.get('action', 'unknown')
             actionParameters = selection.get('parameters', {})
             actionHistory = getattr(context, 'executedActions', None) if hasattr(context, 'executedActions') else None
-            validationResult = await self.contentValidator.validateContent(result.documents, self.workflowIntent, taskStep, actionName, actionParameters, actionHistory)
+            validationResult = await self.contentValidator.validateContent(result.documents, self.workflowIntent, taskStep, actionName, actionParameters, actionHistory, context)
             observation.contentValidation = validationResult
             quality_score = validationResult.get('qualityScore', 0.0)
             if quality_score is None:
@@ -194,6 +194,31 @@ class DynamicMode(BaseMode):
                 if decision:  # Only append if decision is not None
                     context.previousReviewResult.append(decision)
 
+                # Send a ChatLog message if a userMessage is present in the refinement response
+                if decision and decision.userMessage:
+                    try:
+                        currentRound = getattr(workflow, 'currentRound', 0)
+                        currentTask = getattr(workflow, 'currentTask', 0)
+
+                        messageData = {
+                            "workflowId": workflow.id,
+                            "role": "assistant",
+                            "message": decision.userMessage,
+                            "status": "refinement",
+                            "sequenceNr": len(workflow.messages) + 1,
+                            "publishedAt": self.services.utils.timestampGetUtc(),
+                            "documentsLabel": None,
+                            "documents": [],
+                            "roundNumber": currentRound,
+                            "taskNumber": currentTask,
+                            "actionNumber": step
+                        }
+
+                        self.services.chat.storeMessageWithDocuments(workflow, messageData, [])
+                        logger.info(f"Sent refinement userMessage to UI: {decision.userMessage[:100]}...")
+                    except Exception as e:
+                        logger.warning(f"Failed to send refinement userMessage to UI: {str(e)}")
+
                 # Store next action guidance from decision for use in next iteration
                 if decision and decision.status == "continue" and decision.nextAction:
                     # Set nextActionGuidance directly (now defined in TaskContext model)
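The refinement notification reuses the standard chat persistence path: `decision.userMessage` is wrapped in an ordinary message dict (status `"refinement"`, empty document list) and stored via `storeMessageWithDocuments`. A reduced, runnable sketch of the dict construction; the timestamp helper and all concrete values are stand-ins for the real service calls:

from datetime import datetime, timezone

def buildRefinementMessage(workflowId: str, userMessage: str, sequenceNr: int,
                           roundNumber: int, taskNumber: int, actionNumber: int) -> dict:
    # Mirrors the messageData dict assembled in DynamicMode; timestamp helper stubbed here.
    return {
        "workflowId": workflowId,
        "role": "assistant",
        "message": userMessage,
        "status": "refinement",
        "sequenceNr": sequenceNr,
        "publishedAt": datetime.now(timezone.utc).isoformat(),
        "documentsLabel": None,
        "documents": [],
        "roundNumber": roundNumber,
        "taskNumber": taskNumber,
        "actionNumber": actionNumber,
    }

print(buildRefinementMessage("wf-123", "Refining the table layout...", 7, 1, 2, 3)["status"])
# -> refinement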
diff --git a/tests/functional/test10_document_generation_formats.py b/tests/functional/test10_document_generation_formats.py
index 8d963643..9ce9b367 100644
--- a/tests/functional/test10_document_generation_formats.py
+++ b/tests/functional/test10_document_generation_formats.py
@@ -413,12 +413,11 @@ class DocumentGenerationFormatsTester10:
     async def testAllFormats(self) -> Dict[str, Any]:
         """Test document generation in DOCX, XLSX, PPTX, PDF, and HTML formats."""
         print("\n" + "="*80)
-        print("TESTING DOCUMENT GENERATION IN HTML FORMAT")
+        print("TESTING DOCUMENT GENERATION IN ALL FORMATS")
         print("="*80)
 
-        # Only test HTML format
-        formats = ["html"]
-        # formats = ["docx", "xlsx", "pptx", "pdf", "html"]  # Commented out other formats
+        # Test all document formats
+        formats = ["docx", "xlsx", "pptx", "pdf", "html"]
 
         results = {}
         for format in formats:
@@ -471,7 +470,7 @@ class DocumentGenerationFormatsTester10:
     async def runTest(self):
         """Run the complete test."""
         print("\n" + "="*80)
-        print("DOCUMENT GENERATION FORMATS TEST 10 - HTML ONLY")
+        print("DOCUMENT GENERATION FORMATS TEST 10 - ALL FORMATS")
         print("="*80)
 
         try:
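With all five formats active again, one broken renderer can dominate the run, so the per-format loop is most useful if it records a result per format instead of aborting on the first failure. The hunk does not show the loop body; a standalone sketch of that error-isolating shape, with a hypothetical `generateDocument` stand-in for the tester's real per-format call:

# Standalone sketch - generateDocument is a hypothetical stand-in, not the tester's real API.
def generateDocument(format: str) -> dict:
    if format == "pptx":
        raise RuntimeError("renderer unavailable")  # simulate one failing format
    return {"success": True, "format": format}

formats = ["docx", "xlsx", "pptx", "pdf", "html"]
results = {}
for format in formats:
    try:
        results[format] = generateDocument(format)
    except Exception as e:
        results[format] = {"success": False, "error": str(e)}

print(results)  # one failing format does not abort the others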