""" Context and workflow information method module. Handles workflow context queries and document indexing. """ import time import json import logging from typing import Dict, Any, List from datetime import datetime, UTC from modules.workflows.methods.methodBase import MethodBase, action from modules.datamodels.datamodelChat import ActionResult, ActionDocument from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy logger = logging.getLogger(__name__) class MethodContext(MethodBase): """Context and workflow information methods.""" def __init__(self, services): super().__init__(services) self.name = "context" self.description = "Context and workflow information methods" @action async def getDocumentIndex(self, parameters: Dict[str, Any]) -> ActionResult: """ GENERAL: - Purpose: Generate a comprehensive index of all documents available in the current workflow, including documents from all rounds and tasks. - Input requirements: No input documents required. Optional resultType parameter. - Output format: Structured document index in JSON format (default) or text format, listing all documents with their references, metadata, and organization by rounds/tasks. Parameters: - resultType (str, optional): Output format (json, txt, md). Default: json. """ try: workflow = self.services.workflow if not workflow: return ActionResult.isFailure( error="No workflow available" ) resultType = parameters.get("resultType", "json").lower().strip().lstrip('.') # Get available documents index from chat service documentsIndex = self.services.chat.getAvailableDocuments(workflow) if not documentsIndex or documentsIndex == "No documents available" or documentsIndex == "NO DOCUMENTS AVAILABLE - This workflow has no documents to process.": # Return empty index structure if resultType == "json": indexData = { "workflowId": getattr(workflow, 'id', 'unknown'), "totalDocuments": 0, "rounds": [], "documentReferences": [] } indexContent = json.dumps(indexData, indent=2, ensure_ascii=False) else: indexContent = "Document Index\n==============\n\nNo documents available in this workflow.\n" else: # Parse the document index string to extract structured information indexData = self._parseDocumentIndex(documentsIndex, workflow) if resultType == "json": indexContent = json.dumps(indexData, indent=2, ensure_ascii=False) elif resultType == "md": indexContent = self._formatAsMarkdown(indexData) else: # txt indexContent = self._formatAsText(indexData, documentsIndex) # Generate meaningful filename workflowContext = self.services.chat.getWorkflowContext() filename = self._generateMeaningfulFileName( "document_index", resultType if resultType in ["json", "txt", "md"] else "json", workflowContext, "getDocumentIndex" ) validationMetadata = { "actionType": "context.getDocumentIndex", "resultType": resultType, "workflowId": getattr(workflow, 'id', 'unknown'), "totalDocuments": indexData.get("totalDocuments", 0) if isinstance(indexData, dict) else 0 } # Create ActionDocument document = ActionDocument( documentName=filename, documentData=indexContent, mimeType="application/json" if resultType == "json" else "text/plain", validationMetadata=validationMetadata ) return ActionResult.isSuccess(documents=[document]) except Exception as e: logger.error(f"Error generating document index: {str(e)}") return ActionResult.isFailure( error=f"Failed to generate document index: {str(e)}" ) def _parseDocumentIndex(self, documentsIndex: str, workflow: Any) -> Dict[str, Any]: """Parse the document index string into structured data.""" try: indexData = { "workflowId": getattr(workflow, 'id', 'unknown'), "generatedAt": datetime.now(UTC).isoformat(), "totalDocuments": 0, "rounds": [], "documentReferences": [] } # Extract document references from the index string lines = documentsIndex.split('\n') currentRound = None currentDocList = None for line in lines: line = line.strip() if not line: continue # Check for round headers if "Current round documents:" in line: currentRound = "current" continue elif "Past rounds documents:" in line: currentRound = "past" continue # Check for document list references (docList:...) if line.startswith("- docList:"): docListRef = line.replace("- docList:", "").strip() currentDocList = { "reference": docListRef, "round": currentRound, "documents": [] } indexData["rounds"].append(currentDocList) continue # Check for individual document references (docItem:...) if line.startswith(" - docItem:") or line.startswith("- docItem:"): docItemRef = line.replace(" - docItem:", "").replace("- docItem:", "").strip() indexData["documentReferences"].append({ "reference": docItemRef, "round": currentRound, "docList": currentDocList["reference"] if currentDocList else None }) indexData["totalDocuments"] += 1 if currentDocList: currentDocList["documents"].append(docItemRef) return indexData except Exception as e: logger.error(f"Error parsing document index: {str(e)}") return { "workflowId": getattr(workflow, 'id', 'unknown'), "error": f"Failed to parse document index: {str(e)}", "rawIndex": documentsIndex } def _formatAsMarkdown(self, indexData: Dict[str, Any]) -> str: """Format document index as Markdown.""" try: md = f"# Document Index\n\n" md += f"**Workflow ID:** {indexData.get('workflowId', 'unknown')}\n\n" md += f"**Generated At:** {indexData.get('generatedAt', 'unknown')}\n\n" md += f"**Total Documents:** {indexData.get('totalDocuments', 0)}\n\n" if indexData.get('rounds'): md += "## Documents by Round\n\n" for roundInfo in indexData['rounds']: roundLabel = roundInfo.get('round', 'unknown').title() md += f"### {roundLabel} Round\n\n" md += f"**Document List:** `{roundInfo.get('reference', 'unknown')}`\n\n" if roundInfo.get('documents'): md += "**Documents:**\n\n" for docRef in roundInfo['documents']: md += f"- `{docRef}`\n" md += "\n" if indexData.get('documentReferences'): md += "## All Document References\n\n" for docRef in indexData['documentReferences']: md += f"- `{docRef.get('reference', 'unknown')}`\n" return md except Exception as e: logger.error(f"Error formatting as Markdown: {str(e)}") return f"# Document Index\n\nError formatting index: {str(e)}\n" def _formatAsText(self, indexData: Dict[str, Any], rawIndex: str) -> str: """Format document index as plain text.""" try: text = "Document Index\n" text += "=" * 50 + "\n\n" text += f"Workflow ID: {indexData.get('workflowId', 'unknown')}\n" text += f"Generated At: {indexData.get('generatedAt', 'unknown')}\n" text += f"Total Documents: {indexData.get('totalDocuments', 0)}\n\n" # Include the raw formatted index for readability text += rawIndex return text except Exception as e: logger.error(f"Error formatting as text: {str(e)}") return f"Document Index\n\nError formatting index: {str(e)}\n\nRaw index:\n{rawIndex}\n" @action async def extractContent(self, parameters: Dict[str, Any]) -> ActionResult: """ Extract content from documents (separate from AI calls). This action performs pure content extraction without AI processing. The extracted ContentParts can then be used by subsequent AI processing actions. Parameters: - documentList (list, required): Document reference(s) to extract content from. - extractionOptions (dict, optional): Extraction options (if not provided, defaults are used). Returns: - ActionResult with ActionDocument containing ContentExtracted objects - ContentExtracted.parts contains List[ContentPart] (already chunked if needed) """ try: # Init progress logger workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}" operationId = f"context_extract_{workflowId}_{int(time.time())}" # Extract documentList from parameters dict from modules.datamodels.datamodelDocref import DocumentReferenceList documentListParam = parameters.get("documentList") if not documentListParam: return ActionResult.isFailure(error="documentList is required") # Convert to DocumentReferenceList if needed if isinstance(documentListParam, DocumentReferenceList): documentList = documentListParam elif isinstance(documentListParam, str): documentList = DocumentReferenceList.from_string_list([documentListParam]) elif isinstance(documentListParam, list): documentList = DocumentReferenceList.from_string_list(documentListParam) else: return ActionResult.isFailure(error=f"Invalid documentList type: {type(documentListParam)}") # Start progress tracking parentOperationId = parameters.get('parentOperationId') self.services.chat.progressLogStart( operationId, "Extracting content from documents", "Content Extraction", f"Documents: {len(documentList.references)}", parentOperationId=parentOperationId ) # Get ChatDocuments from documentList self.services.chat.progressLogUpdate(operationId, 0.2, "Loading documents") chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(documentList) if not chatDocuments: self.services.chat.progressLogFinish(operationId, False) return ActionResult.isFailure(error="No documents found in documentList") logger.info(f"Extracting content from {len(chatDocuments)} documents") # Prepare extraction options self.services.chat.progressLogUpdate(operationId, 0.3, "Preparing extraction options") extractionOptionsParam = parameters.get("extractionOptions") # Convert dict to ExtractionOptions object if needed, or create defaults if extractionOptionsParam: if isinstance(extractionOptionsParam, dict): # Convert dict to ExtractionOptions object extractionOptions = ExtractionOptions(**extractionOptionsParam) elif isinstance(extractionOptionsParam, ExtractionOptions): extractionOptions = extractionOptionsParam else: # Invalid type, use defaults extractionOptions = None else: extractionOptions = None # If extractionOptions not provided, create defaults if not extractionOptions: # Default extraction options for pure content extraction (no AI processing) extractionOptions = ExtractionOptions( prompt="Extract all content from the document", mergeStrategy=MergeStrategy( mergeType="concatenate", groupBy="typeGroup", orderBy="id" ), processDocumentsIndividually=True ) # Call extraction service with hierarchical progress logging self.services.chat.progressLogUpdate(operationId, 0.4, "Initiating") self.services.chat.progressLogUpdate(operationId, 0.5, f"Extracting content from {len(chatDocuments)} documents") # Pass operationId for hierarchical per-document progress logging extractedResults = self.services.extraction.extractContent(chatDocuments, extractionOptions, operationId=operationId) # Build ActionDocuments from ContentExtracted results self.services.chat.progressLogUpdate(operationId, 0.8, "Building result documents") actionDocuments = [] # Map extracted results back to original documents by index (results are in same order) for i, extracted in enumerate(extractedResults): # Get original document name if available originalDoc = chatDocuments[i] if i < len(chatDocuments) else None if originalDoc and hasattr(originalDoc, 'fileName') and originalDoc.fileName: # Use original filename with "extracted_" prefix baseName = originalDoc.fileName.rsplit('.', 1)[0] if '.' in originalDoc.fileName else originalDoc.fileName documentName = f"{baseName}_extracted_{extracted.id}.json" else: # Fallback to generic name with index documentName = f"document_{i+1:03d}_extracted_{extracted.id}.json" # Store ContentExtracted object in ActionDocument.documentData validationMetadata = { "actionType": "context.extractContent", "documentIndex": i, "extractedId": extracted.id, "partCount": len(extracted.parts) if extracted.parts else 0, "originalFileName": originalDoc.fileName if originalDoc and hasattr(originalDoc, 'fileName') else None } actionDoc = ActionDocument( documentName=documentName, documentData=extracted, # ContentExtracted object mimeType="application/json", validationMetadata=validationMetadata ) actionDocuments.append(actionDoc) self.services.chat.progressLogFinish(operationId, True) return ActionResult.isSuccess(documents=actionDocuments) except Exception as e: logger.error(f"Error in content extraction: {str(e)}") # Complete progress tracking with failure try: self.services.chat.progressLogFinish(operationId, False) except: pass # Don't fail on progress logging errors return ActionResult.isFailure(error=str(e))