diff --git a/.gitignore b/.gitignore index eb6d2935..df4b0c6c 100644 --- a/.gitignore +++ b/.gitignore @@ -167,4 +167,5 @@ cython_debug/ # local data gwserver/_database* gwserver/results/* -*.log.* \ No newline at end of file +*.log.* +test-chat \ No newline at end of file diff --git a/env_dev.env b/env_dev.env index 6e5ed09a..9ebbb93b 100644 --- a/env_dev.env +++ b/env_dev.env @@ -64,12 +64,12 @@ Connector_AiAnthropic_MODEL_NAME = claude-3-5-sonnet-20241022 Connector_AiAnthropic_TEMPERATURE = 0.2 Connector_AiAnthropic_MAX_TOKENS = 2000 -# LangDoc configuration -Connector_AiLangdoc_API_URL = https://api.langdock.com/v1/chat/completions -Connector_AiLangdoc_API_SECRET = sk-9KaNH1FfEx7SkTijsFpXeTIc9_xOmoo7e0hW6SqrYavFq_bgjcULa7PXp3kWQpp4gfk8-U0B4L91CP6YpAJxZg -Connector_AiLangdoc_MODEL_NAME = gpt-4o -Connector_AiLangdoc_TEMPERATURE = 0.2 -Connector_AiLangdoc_MAX_TOKENS = 2000 +# Perplexity AI configuration +Connector_AiPerplexity_API_URL = https://api.perplexity.ai/chat/completions +Connector_AiPerplexity_API_SECRET = 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 +Connector_AiPerplexity_MODEL_NAME = sonar +Connector_AiPerplexity_TEMPERATURE = 0.2 +Connector_AiPerplexity_MAX_TOKENS = 2000 # Agent Mail configuration Service_MSFT_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c diff --git a/env_int.env b/env_int.env index 7ddc210f..4a0f3e39 100644 --- a/env_int.env +++ b/env_int.env @@ -64,12 +64,12 @@ Connector_AiAnthropic_MODEL_NAME = claude-3-5-sonnet-20241022 Connector_AiAnthropic_TEMPERATURE = 0.2 Connector_AiAnthropic_MAX_TOKENS = 2000 -# LangDoc configuration -Connector_AiLangdoc_API_URL = https://api.langdock.com/v1/chat/completions -Connector_AiLangdoc_API_SECRET = sk-9KaNH1FfEx7SkTijsFpXeTIc9_xOmoo7e0hW6SqrYavFq_bgjcULa7PXp3kWQpp4gfk8-U0B4L91CP6YpAJxZg -Connector_AiLangdoc_MODEL_NAME = gpt-4o -Connector_AiLangdoc_TEMPERATURE = 0.2 -Connector_AiLangdoc_MAX_TOKENS = 2000 +# Perplexity AI configuration +Connector_AiPerplexity_API_URL = 
https://api.perplexity.ai/chat/completions +Connector_AiPerplexity_API_SECRET = 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 +Connector_AiPerplexity_MODEL_NAME = sonar +Connector_AiPerplexity_TEMPERATURE = 0.2 +Connector_AiPerplexity_MAX_TOKENS = 2000 # Agent Mail configuration Service_MSFT_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c diff --git a/env_prod.env b/env_prod.env index d7a0a3bd..c1ba8086 100644 --- a/env_prod.env +++ b/env_prod.env @@ -64,12 +64,12 @@ Connector_AiAnthropic_MODEL_NAME = claude-3-5-sonnet-20241022 Connector_AiAnthropic_TEMPERATURE = 0.2 Connector_AiAnthropic_MAX_TOKENS = 2000 -# LangDoc configuration -Connector_AiLangdoc_API_URL = https://api.langdock.com/v1/chat/completions -Connector_AiLangdoc_API_SECRET = sk-9KaNH1FfEx7SkTijsFpXeTIc9_xOmoo7e0hW6SqrYavFq_bgjcULa7PXp3kWQpp4gfk8-U0B4L91CP6YpAJxZg -Connector_AiLangdoc_MODEL_NAME = gpt-4o -Connector_AiLangdoc_TEMPERATURE = 0.2 -Connector_AiLangdoc_MAX_TOKENS = 2000 +# Perplexity AI configuration +Connector_AiPerplexity_API_URL = https://api.perplexity.ai/chat/completions +Connector_AiPerplexity_API_SECRET = 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 +Connector_AiPerplexity_MODEL_NAME = sonar +Connector_AiPerplexity_TEMPERATURE = 0.2 +Connector_AiPerplexity_MAX_TOKENS = 2000 # Agent Mail configuration Service_MSFT_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c diff --git a/modules/connectors/connectorAiAnthropic.py b/modules/connectors/connectorAiAnthropic.py index f9ce2f03..1bcfe289 100644 --- a/modules/connectors/connectorAiAnthropic.py +++ b/modules/connectors/connectorAiAnthropic.py @@ -62,13 +62,52 @@ class AiAnthropic: if maxTokens is None: maxTokens = self.config.get("maxTokens", 2000) + # Transform OpenAI-style messages to Anthropic format: + # - Move any 'system' role content to top-level 'system' + # - Keep only 'user'/'assistant' messages in the list + system_contents: List[str] = [] + converted_messages: List[Dict[str, Any]] = [] + for m in messages: + 
role = m.get("role") + content = m.get("content", "") + if role == "system": + # Collect system content; Anthropic expects top-level 'system' + if isinstance(content, list): + # Join text parts if provided as blocks + joined = "\n\n".join( + [ + (part.get("text") if isinstance(part, dict) else str(part)) + for part in content + ] + ) + system_contents.append(joined) + else: + system_contents.append(str(content)) + continue + # For Anthropic, content can be a string; pass through strings, collapse blocks + if isinstance(content, list): + # Collapse to text if blocks are provided + collapsed = "\n\n".join( + [ + (part.get("text") if isinstance(part, dict) else str(part)) + for part in content + ] + ) + converted_messages.append({"role": role, "content": collapsed}) + else: + converted_messages.append({"role": role, "content": content}) + + system_prompt = "\n\n".join([s for s in system_contents if s]) if system_contents else None + # Create Anthropic API payload - payload = { + payload: Dict[str, Any] = { "model": self.modelName, - "messages": messages, + "messages": converted_messages, "temperature": temperature, - "max_tokens": maxTokens + "max_tokens": maxTokens, } + if system_prompt: + payload["system"] = system_prompt response = await self.httpClient.post( self.apiUrl, @@ -174,8 +213,8 @@ class AiAnthropic: } ] - # Use the existing callApi function with the Vision model - response = await self.callApi(messages) + # Use the existing callAiBasic function with the Vision model + response = await self.callAiBasic(messages) # Extract and return content return response["choices"][0]["message"]["content"] diff --git a/modules/connectors/connectorAiLangdoc.py b/modules/connectors/connectorAiLangdoc.py deleted file mode 100644 index da91cf92..00000000 --- a/modules/connectors/connectorAiLangdoc.py +++ /dev/null @@ -1,406 +0,0 @@ -import logging -import httpx -import asyncio -import re -from typing import Dict, Any, List, Union, Optional -from fastapi import HTTPException 
-from modules.shared.configuration import APP_CONFIG - -# Configure logger -logger = logging.getLogger(__name__) - -def loadConfigData(): - """Load configuration data for LangDoc connector""" - return { - "apiKey": APP_CONFIG.get('Connector_AiLangdoc_API_SECRET'), - "apiUrl": APP_CONFIG.get('Connector_AiLangdoc_API_URL'), - "modelName": APP_CONFIG.get('Connector_AiLangdoc_MODEL_NAME'), - "temperature": float(APP_CONFIG.get('Connector_AiLangdoc_TEMPERATURE')), - "maxTokens": int(APP_CONFIG.get('Connector_AiLangdoc_MAX_TOKENS')) - } - -class AiLangdoc: - """Connector for communication with the LangDoc API (OpenAI-compatible).""" - - def __init__(self): - # Load configuration - self.config = loadConfigData() - self.apiKey = self.config["apiKey"] - self.apiUrl = self.config["apiUrl"] - self.modelName = self.config["modelName"] - - # HttpClient for API calls - self.httpClient = httpx.AsyncClient( - timeout=120.0, # Longer timeout for complex requests - headers={ - "Authorization": f"Bearer {self.apiKey}", - "Content-Type": "application/json" - } - ) - - logger.info(f"LangDoc Connector initialized with model: {self.modelName}") - - async def callAiBasic(self, messages: List[Dict[str, Any]], temperature: float = None, maxTokens: int = None) -> str: - """ - Calls the LangDoc API with the given messages. 
- - Args: - messages: List of messages in OpenAI format (role, content) - temperature: Temperature for response generation (0.0-1.0) - maxTokens: Maximum number of tokens in the response - - Returns: - The response from the LangDoc API - - Raises: - HTTPException: For errors in API communication - """ - try: - # Use parameters from configuration if none were overridden - if temperature is None: - temperature = self.config.get("temperature", 0.2) - - if maxTokens is None: - maxTokens = self.config.get("maxTokens", 2000) - - payload = { - "model": self.modelName, - "messages": messages, - "temperature": temperature, - "max_tokens": maxTokens - } - - response = await self.httpClient.post( - self.apiUrl, - json=payload - ) - - if response.status_code != 200: - error_detail = f"LangDoc API error: {response.status_code} - {response.text}" - logger.error(error_detail) - - # Provide more specific error messages based on status code - if response.status_code == 429: - error_message = "Rate limit exceeded. Please wait before making another request." - elif response.status_code == 401: - error_message = "Invalid API key. Please check your LangDoc API configuration." - elif response.status_code == 400: - error_message = f"Invalid request to LangDoc API: {response.text}" - else: - error_message = f"LangDoc API error ({response.status_code}): {response.text}" - - raise HTTPException(status_code=500, detail=error_message) - - responseJson = response.json() - content = responseJson["choices"][0]["message"]["content"] - return content - - except Exception as e: - logger.error(f"Error calling LangDoc API: {str(e)}") - raise HTTPException(status_code=500, detail=f"Error calling LangDoc API: {str(e)}") - - async def callAiImage(self, prompt: str, imageData: Union[str, bytes], mimeType: str = None) -> str: - """ - Analyzes an image using LangDoc's vision capabilities. 
- - Args: - imageData: Either a file path (str) or image data (bytes) - mimeType: The MIME type of the image (optional, only for binary data) - prompt: The prompt for analysis - - Returns: - The analysis response as text - """ - try: - # Distinguish between file path and binary data - if isinstance(imageData, str): - # It's a file path - import filehandling only when needed - from modules import agentserviceFilemanager as fileHandler - base64Data, autoMimeType = fileHandler.encodeFileToBase64(imageData) - mimeType = mimeType or autoMimeType - else: - # It's binary data - import base64 - base64Data = base64.b64encode(imageData).decode('utf-8') - # MIME type must be specified for binary data - if not mimeType: - # Fallback to generic image type - mimeType = "image/png" - - # Prepare the payload for the Vision API - messages = [ - { - "role": "user", - "content": [ - {"type": "text", "text": prompt}, - { - "type": "image_url", - "image_url": { - "url": f"data:{mimeType};base64,{base64Data}" - } - } - ] - } - ] - - # Use the existing callAiBasic function - response = await self.callAiBasic(messages) - - return response - - except Exception as e: - logger.error(f"Error during image analysis: {str(e)}", exc_info=True) - return f"[Error during image analysis: {str(e)}]" - - async def listModels(self) -> List[Dict[str, Any]]: - """ - Lists available models from the LangDoc API. 
- - Returns: - List of available models with their details - """ - try: - # LangDoc uses OpenAI-compatible endpoints - modelsUrl = self.apiUrl.replace("/chat/completions", "/models") - - response = await self.httpClient.get(modelsUrl) - - if response.status_code != 200: - error_detail = f"LangDoc API error listing models: {response.status_code} - {response.text}" - logger.error(error_detail) - raise HTTPException(status_code=500, detail=error_detail) - - responseJson = response.json() - return responseJson.get("data", []) - - except Exception as e: - logger.error(f"Error listing LangDoc models: {str(e)}") - raise HTTPException(status_code=500, detail=f"Error listing LangDoc models: {str(e)}") - - async def getModelInfo(self, modelName: str = None) -> Dict[str, Any]: - """ - Gets information about a specific model. - - Args: - modelName: Name of the model to get info for (uses default if None) - - Returns: - Model information dictionary - """ - try: - if modelName is None: - modelName = self.modelName - - models = await self.listModels() - - for model in models: - if model.get("id") == modelName: - return model - - raise HTTPException(status_code=404, detail=f"Model {modelName} not found") - - except Exception as e: - logger.error(f"Error getting LangDoc model info: {str(e)}") - raise HTTPException(status_code=500, detail=f"Error getting LangDoc model info: {str(e)}") - - async def generateImage(self, prompt: str, size: str = "1024x1024", quality: str = "standard", style: str = "vivid") -> Dict[str, Any]: - """ - Generates an image using LangDoc's DALL-E 3 integration. 
- - Args: - prompt: Text description of the image to generate - size: Image size - "1024x1024", "1792x1024", or "1024x1792" - quality: Image quality - "standard" or "hd" - style: Image style - "vivid" or "natural" - - Returns: - Dictionary containing the generated image data and metadata - - Raises: - HTTPException: For errors in API communication - """ - try: - # Use OpenAI-compatible images endpoint - imagesUrl = self.apiUrl.replace("/chat/completions", "/images/generations") - - payload = { - "model": "dall-e-3", - "prompt": prompt, - "size": size, - "quality": quality, - "style": style, - "n": 1 - } - - response = await self.httpClient.post( - imagesUrl, - json=payload - ) - - if response.status_code != 200: - error_detail = f"LangDoc Image Generation API error: {response.status_code} - {response.text}" - logger.error(error_detail) - - # Provide more specific error messages - if response.status_code == 429: - error_message = "Rate limit exceeded for image generation. Please wait before making another request." - elif response.status_code == 401: - error_message = "Invalid API key for image generation. Please check your LangDoc API configuration." 
- elif response.status_code == 400: - error_message = f"Invalid request to LangDoc Image API: {response.text}" - else: - error_message = f"LangDoc Image API error ({response.status_code}): {response.text}" - - raise HTTPException(status_code=500, detail=error_message) - - responseJson = response.json() - - # Extract image data - imageData = responseJson.get("data", []) - if not imageData: - raise HTTPException(status_code=500, detail="No image data returned from LangDoc API") - - imageInfo = imageData[0] - - return { - "success": True, - "image_url": imageInfo.get("url"), - "revised_prompt": imageInfo.get("revised_prompt"), - "size": size, - "quality": quality, - "style": style, - "model": "dall-e-3", - "created": responseJson.get("created"), - "raw_response": responseJson - } - - except Exception as e: - logger.error(f"Error generating image with LangDoc: {str(e)}") - raise HTTPException(status_code=500, detail=f"Error generating image with LangDoc: {str(e)}") - - async def generateImageWithVariations(self, prompt: str, variations: int = 1, size: str = "1024x1024", quality: str = "standard", style: str = "vivid") -> List[Dict[str, Any]]: - """ - Generates multiple image variations using LangDoc's DALL-E 3 integration. 
- - Args: - prompt: Text description of the image to generate - variations: Number of variations to generate (1-4) - size: Image size - "1024x1024", "1792x1024", or "1024x1792" - quality: Image quality - "standard" or "hd" - style: Image style - "vivid" or "natural" - - Returns: - List of dictionaries containing generated image data and metadata - - Raises: - HTTPException: For errors in API communication - """ - try: - # Limit variations to reasonable number - variations = min(max(variations, 1), 4) - - # Use OpenAI-compatible images endpoint - imagesUrl = self.apiUrl.replace("/chat/completions", "/images/generations") - - results = [] - - # Generate multiple variations by making multiple API calls - for i in range(variations): - # Add variation to prompt to get different results - variationPrompt = f"{prompt} (variation {i+1})" - - payload = { - "model": "dall-e-3", - "prompt": variationPrompt, - "size": size, - "quality": quality, - "style": style, - "n": 1 - } - - response = await self.httpClient.post( - imagesUrl, - json=payload - ) - - if response.status_code != 200: - logger.warning(f"Failed to generate variation {i+1}: {response.status_code} - {response.text}") - continue - - responseJson = response.json() - imageData = responseJson.get("data", []) - - if imageData: - imageInfo = imageData[0] - results.append({ - "variation": i + 1, - "image_url": imageInfo.get("url"), - "revised_prompt": imageInfo.get("revised_prompt"), - "size": size, - "quality": quality, - "style": style, - "model": "dall-e-3", - "created": responseJson.get("created") - }) - - # Add small delay between requests to avoid rate limiting - if i < variations - 1: - await asyncio.sleep(1) - - return results - - except Exception as e: - logger.error(f"Error generating image variations with LangDoc: {str(e)}") - raise HTTPException(status_code=500, detail=f"Error generating image variations with LangDoc: {str(e)}") - - async def generateImageWithChat(self, prompt: str, size: str = "1024x1024", 
quality: str = "standard", style: str = "vivid") -> str: - """ - Generates an image using LangDoc's chat interface with image generation tools. - This method uses the chat completions endpoint with image generation capabilities. - - Args: - prompt: Text description of the image to generate - size: Image size - "1024x1024", "1792x1024", or "1024x1792" - quality: Image quality - "standard" or "hd" - style: Image style - "vivid" or "natural" - - Returns: - Response text from the chat model (may include image references) - - Raises: - HTTPException: For errors in API communication - """ - try: - # Create a prompt that requests image generation - imagePrompt = f"Please generate an image with the following description: {prompt}. Size: {size}, Quality: {quality}, Style: {style}" - - messages = [ - { - "role": "user", - "content": imagePrompt - } - ] - - # Use the chat completions endpoint - response = await self.callAiBasic(messages) - - return response - - except Exception as e: - logger.error(f"Error generating image with chat: {str(e)}") - raise HTTPException(status_code=500, detail=f"Error generating image with chat: {str(e)}") - - async def _testConnection(self) -> bool: - """ - Tests the connection to the LangDoc API. 
- - Returns: - True if connection is successful, False otherwise - """ - try: - # Try to list models as a simple connection test - await self.listModels() - return True - except Exception as e: - logger.error(f"LangDoc connection test failed: {str(e)}") - return False diff --git a/modules/connectors/connectorAiPerplexity.py b/modules/connectors/connectorAiPerplexity.py new file mode 100644 index 00000000..fc97b885 --- /dev/null +++ b/modules/connectors/connectorAiPerplexity.py @@ -0,0 +1,255 @@ +import logging +import httpx +import asyncio +from typing import Dict, Any, List, Union, Optional +from fastapi import HTTPException +from modules.shared.configuration import APP_CONFIG + +# Configure logger +logger = logging.getLogger(__name__) + +def loadConfigData(): + """Load configuration data for Perplexity connector""" + return { + "apiKey": APP_CONFIG.get('Connector_AiPerplexity_API_SECRET'), + "apiUrl": APP_CONFIG.get('Connector_AiPerplexity_API_URL'), + "modelName": APP_CONFIG.get('Connector_AiPerplexity_MODEL_NAME'), + "temperature": float(APP_CONFIG.get('Connector_AiPerplexity_TEMPERATURE')), + "maxTokens": int(APP_CONFIG.get('Connector_AiPerplexity_MAX_TOKENS')) + } + +class AiPerplexity: + """Connector for communication with the Perplexity API.""" + + def __init__(self): + # Load configuration + self.config = loadConfigData() + self.apiKey = self.config["apiKey"] + self.apiUrl = self.config["apiUrl"] + self.modelName = self.config["modelName"] + + # HttpClient for API calls + self.httpClient = httpx.AsyncClient( + timeout=120.0, # Longer timeout for complex requests + headers={ + "Authorization": f"Bearer {self.apiKey}", + "Content-Type": "application/json", + "Accept": "application/json" + } + ) + + logger.info(f"Perplexity Connector initialized with model: {self.modelName}") + + async def callAiBasic(self, messages: List[Dict[str, Any]], temperature: float = None, maxTokens: int = None) -> str: + """ + Calls the Perplexity API with the given messages. 
+ + Args: + messages: List of messages in OpenAI format (role, content) + temperature: Temperature for response generation (0.0-1.0) + maxTokens: Maximum number of tokens in the response + + Returns: + The response from the Perplexity API + + Raises: + HTTPException: For errors in API communication + """ + try: + # Use parameters from configuration if none were overridden + if temperature is None: + temperature = self.config.get("temperature", 0.2) + + if maxTokens is None: + maxTokens = self.config.get("maxTokens", 2000) + + payload = { + "model": self.modelName, + "messages": messages, + "temperature": temperature, + "max_tokens": maxTokens + } + + response = await self.httpClient.post( + self.apiUrl, + json=payload + ) + + if response.status_code != 200: + error_detail = f"Perplexity API error: {response.status_code} - {response.text}" + logger.error(error_detail) + + # Provide more specific error messages based on status code + if response.status_code == 429: + error_message = "Rate limit exceeded. Please wait before making another request." + elif response.status_code == 401: + error_message = "Invalid API key. Please check your Perplexity API configuration." + elif response.status_code == 400: + error_message = f"Invalid request to Perplexity API: {response.text}" + else: + error_message = f"Perplexity API error ({response.status_code}): {response.text}" + + raise HTTPException(status_code=500, detail=error_message) + + responseJson = response.json() + content = responseJson["choices"][0]["message"]["content"] + return content + + except Exception as e: + logger.error(f"Error calling Perplexity API: {str(e)}") + raise HTTPException(status_code=500, detail=f"Error calling Perplexity API: {str(e)}") + + async def callAiWithWebSearch(self, query: str, temperature: float = None, maxTokens: int = None) -> str: + """ + Calls Perplexity API with web search capabilities for research. 
+ + Args: + query: The research query or question + temperature: Temperature for response generation (0.0-1.0) + maxTokens: Maximum number of tokens in the response + + Returns: + The response from Perplexity with web search context + """ + try: + # Use parameters from configuration if none were overridden + if temperature is None: + temperature = self.config.get("temperature", 0.2) + + if maxTokens is None: + maxTokens = self.config.get("maxTokens", 2000) + + # For web search, we use the configured model name + webSearchModel = self.modelName + + payload = { + "model": webSearchModel, + "messages": [ + { + "role": "user", + "content": query + } + ], + "temperature": temperature, + "max_tokens": maxTokens + } + + response = await self.httpClient.post( + self.apiUrl, + json=payload + ) + + if response.status_code != 200: + error_detail = f"Perplexity Web Search API error: {response.status_code} - {response.text}" + logger.error(error_detail) + + if response.status_code == 429: + error_message = "Rate limit exceeded for web search. Please wait before making another request." + elif response.status_code == 401: + error_message = "Invalid API key for web search. Please check your Perplexity API configuration." + elif response.status_code == 400: + error_message = f"Invalid request to Perplexity Web Search API: {response.text}" + else: + error_message = f"Perplexity Web Search API error ({response.status_code}): {response.text}" + + raise HTTPException(status_code=500, detail=error_message) + + responseJson = response.json() + content = responseJson["choices"][0]["message"]["content"] + return content + + except Exception as e: + logger.error(f"Error calling Perplexity Web Search API: {str(e)}") + raise HTTPException(status_code=500, detail=f"Error calling Perplexity Web Search API: {str(e)}") + + async def researchTopic(self, topic: str, depth: str = "basic") -> str: + """ + Research a topic using Perplexity's web search capabilities. 
+ + Args: + topic: The topic to research + depth: Research depth - "basic", "detailed", or "comprehensive" + + Returns: + Comprehensive research results on the topic + """ + try: + # Create research prompts based on depth + if depth == "basic": + prompt = f"Provide a basic overview of: {topic}" + elif depth == "detailed": + prompt = f"Provide a detailed analysis of: {topic}. Include recent developments, key facts, and important information." + else: # comprehensive + prompt = f"Provide a comprehensive research report on: {topic}. Include recent developments, key facts, statistics, expert opinions, and current trends." + + return await self.callAiWithWebSearch(prompt) + + except Exception as e: + logger.error(f"Error researching topic: {str(e)}") + raise HTTPException(status_code=500, detail=f"Error researching topic: {str(e)}") + + async def answerQuestion(self, question: str, context: str = None) -> str: + """ + Answer a question using web search for current information. + + Args: + question: The question to answer + context: Optional context to provide + + Returns: + Answer with web search context + """ + try: + if context: + prompt = f"Context: {context}\n\nQuestion: {question}\n\nPlease provide a comprehensive answer using current information from the web." + else: + prompt = f"Question: {question}\n\nPlease provide a comprehensive answer using current information from the web." + + return await self.callAiWithWebSearch(prompt) + + except Exception as e: + logger.error(f"Error answering question: {str(e)}") + raise HTTPException(status_code=500, detail=f"Error answering question: {str(e)}") + + async def getCurrentNews(self, topic: str = None, limit: int = 5) -> str: + """ + Get current news on a specific topic. + + Args: + topic: The topic to get news about (optional) + limit: Number of news items to retrieve + + Returns: + Current news information + """ + try: + if topic: + prompt = f"Get the latest news about {topic}. 
Provide {limit} recent news items with sources and dates." + else: + prompt = f"Get the latest news. Provide {limit} recent news items with sources and dates." + + return await self.callAiWithWebSearch(prompt) + + except Exception as e: + logger.error(f"Error getting current news: {str(e)}") + raise HTTPException(status_code=500, detail=f"Error getting current news: {str(e)}") + + async def _testConnection(self) -> bool: + """ + Tests the connection to the Perplexity API. + + Returns: + True if connection is successful, False otherwise + """ + try: + # Try a simple test message + testMessages = [ + {"role": "user", "content": "Hello, please respond with just 'OK' to confirm the connection works."} + ] + + response = await self.callAiBasic(testMessages) + return response and len(response.strip()) > 0 + + except Exception as e: + logger.error(f"Perplexity connection test failed: {str(e)}") + return False diff --git a/modules/connectors/connectorAiTavily.py b/modules/connectors/connectorAiTavily.py index 10ac105d..f86c49b2 100644 --- a/modules/connectors/connectorAiTavily.py +++ b/modules/connectors/connectorAiTavily.py @@ -4,6 +4,7 @@ import logging import asyncio from dataclasses import dataclass +from typing import Optional from tavily import AsyncTavilyClient from modules.shared.configuration import APP_CONFIG from modules.shared.timezoneUtils import get_utc_timestamp @@ -29,6 +30,7 @@ logger = logging.getLogger(__name__) class WebSearchResult: title: str url: str + raw_content: Optional[str] = None @dataclass class WebCrawlResult: @@ -83,7 +85,11 @@ class ConnectorWeb: return WebSearchActionResult(success=False, error=str(e)) result_items = [ - WebSearchResultItem(title=result.title, url=result.url) + WebSearchResultItem( + title=result.title, + url=result.url, + raw_content=getattr(result, 'raw_content', None) + ) for result in raw_results ] @@ -246,6 +252,15 @@ class ConnectorWeb: urls = [result.url for result in search_results] return await self._crawl(urls, 
extract_depth=extract_depth, format=format) + def _clean_url(self, url: str) -> str: + """Clean URL by removing extra text that might be appended.""" + import re + # Extract just the URL part, removing any extra text after it + url_match = re.match(r'(https?://[^\s,]+)', url) + if url_match: + return url_match.group(1) + return url + async def _search( self, query: str, @@ -289,7 +304,11 @@ class ConnectorWeb: response = await self.client.search(**kwargs) return [ - WebSearchResult(title=result["title"], url=result["url"]) + WebSearchResult( + title=result["title"], + url=self._clean_url(result["url"]), + raw_content=result.get("raw_content") + ) for result in response["results"] ] @@ -304,26 +323,53 @@ class ConnectorWeb: retry_delay = self.crawl_retry_delay timeout = self.crawl_timeout + logger.debug(f"Starting crawl of {len(urls)} URLs: {urls}") + logger.debug(f"Crawl settings: extract_depth={extract_depth}, format={format}, timeout={timeout}s") + for attempt in range(max_retries + 1): try: + logger.debug(f"Crawl attempt {attempt + 1}/{max_retries + 1}") + # Use asyncio.wait_for for timeout # Build kwargs for extract kwargs_extract: dict = {"urls": urls} kwargs_extract["extract_depth"] = extract_depth or "advanced" - kwargs_extract["format"] = format or "text" + kwargs_extract["format"] = format or "markdown" # Use markdown to get HTML structure + logger.debug(f"Sending request to Tavily with kwargs: {kwargs_extract}") + response = await asyncio.wait_for( self.client.extract(**kwargs_extract), timeout=timeout ) - return [ - WebCrawlResult(url=result["url"], content=result["raw_content"]) + logger.debug(f"Tavily response received: {list(response.keys())}") + + # Debug: Log what Tavily actually returns + if "results" in response and response["results"]: + logger.debug(f"Tavily returned {len(response['results'])} results") + logger.debug(f"First result keys: {list(response['results'][0].keys())}") + logger.debug(f"First result has raw_content: {'raw_content' in 
response['results'][0]}") + + # Log each result + for i, result in enumerate(response["results"]): + logger.debug(f"Result {i+1}: URL={result.get('url', 'N/A')}, content_length={len(result.get('raw_content', result.get('content', '')))}") + else: + logger.warning(f"Tavily returned no results in response: {response}") + + results = [ + WebCrawlResult( + url=result["url"], + content=result.get("raw_content", result.get("content", "")) # Try raw_content first, fallback to content + ) for result in response["results"] ] + logger.debug(f"Crawl successful: extracted {len(results)} results") + return results + except asyncio.TimeoutError: - logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds") + logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds for URLs: {urls}") if attempt < max_retries: logger.info(f"Retrying in {retry_delay} seconds...") await asyncio.sleep(retry_delay) @@ -331,7 +377,22 @@ class ConnectorWeb: raise Exception(f"Crawl failed after {max_retries + 1} attempts due to timeout") except Exception as e: - logger.warning(f"Crawl attempt {attempt + 1} failed: {str(e)}") + logger.warning(f"Crawl attempt {attempt + 1} failed for URLs {urls}: {str(e)}") + logger.debug(f"Full error details: {type(e).__name__}: {str(e)}") + + # Check if it's a validation error and log more details + if "validation" in str(e).lower(): + logger.debug(f"URL validation failed. 
Checking URL format:") + for i, url in enumerate(urls): + logger.debug(f" URL {i+1}: '{url}' (length: {len(url)})") + # Check for common URL issues + if ' ' in url: + logger.debug(f" WARNING: URL contains spaces!") + if not url.startswith(('http://', 'https://')): + logger.debug(f" WARNING: URL doesn't start with http/https!") + if len(url) > 2000: + logger.debug(f" WARNING: URL is very long ({len(url)} chars)") + if attempt < max_retries: logger.info(f"Retrying in {retry_delay} seconds...") await asyncio.sleep(retry_delay) diff --git a/modules/connectors/connectorDbPostgre.py b/modules/connectors/connectorDbPostgre.py index 236cb796..dd08f4e0 100644 --- a/modules/connectors/connectorDbPostgre.py +++ b/modules/connectors/connectorDbPostgre.py @@ -384,6 +384,57 @@ class DatabaseConnector: logger.info( f"Created table '{table}' with columns from Pydantic model" ) + else: + # Table exists: ensure all columns from model are present (simple additive migration) + try: + cursor.execute( + """ + SELECT column_name FROM information_schema.columns + WHERE LOWER(table_name) = LOWER(%s) AND table_schema = 'public' + """, + (table,), + ) + existing_columns = { + row["column_name"] for row in cursor.fetchall() + } + + # Desired columns based on model + model_fields = _get_model_fields(model_class) + desired_columns = ( + set(["id"]) + | set(model_fields.keys()) + | {"_createdAt", "_modifiedAt", "_createdBy", "_modifiedBy"} + ) + + # Add missing columns + for col in sorted(desired_columns - existing_columns): + # Determine SQL type + if col in ["id"]: + continue # primary key exists already + sql_type = model_fields.get(col) + if col in ["_createdAt"]: + sql_type = "DOUBLE PRECISION" + elif col in ["_modifiedAt"]: + sql_type = "DOUBLE PRECISION" + elif col in ["_createdBy", "_modifiedBy"]: + sql_type = "VARCHAR(255)" + if not sql_type: + sql_type = "TEXT" + try: + cursor.execute( + f'ALTER TABLE "{table}" ADD COLUMN "{col}" {sql_type}' + ) + logger.info( + f"Added missing column 
'{col}' ({sql_type}) to '{table}'" + ) + except Exception as add_err: + logger.warning( + f"Could not add column '{col}' to '{table}': {add_err}" + ) + except Exception as ensure_err: + logger.warning( + f"Could not ensure columns for existing table '{table}': {ensure_err}" + ) self.connection.commit() return True diff --git a/modules/datamodels/__init__.py b/modules/datamodels/__init__.py index bc18cabd..2ddc1189 100644 --- a/modules/datamodels/__init__.py +++ b/modules/datamodels/__init__.py @@ -10,7 +10,6 @@ from . import datamodelWeb as web from . import datamodelUam as uam from . import datamodelSecurity as security from . import datamodelNeutralizer as neutralizer -from . import datamodelWorkflow as workflow from . import datamodelChat as chat from . import datamodelFiles as files from . import datamodelVoice as voice diff --git a/modules/datamodels/datamodelAi.py b/modules/datamodels/datamodelAi.py index bab071b6..ad06f785 100644 --- a/modules/datamodels/datamodelAi.py +++ b/modules/datamodels/datamodelAi.py @@ -111,6 +111,11 @@ class AiCallOptions(BaseModel): callType: Literal["planning", "text"] = Field(default="text", description="Call type: planning or text") safetyMargin: float = Field(default=0.1, ge=0.0, le=0.5, description="Safety margin for token limits (0.0-0.5)") modelCapabilities: Optional[List[str]] = Field(default=None, description="Required model capabilities for filtering") + + # Model generation parameters + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0, description="Temperature for response generation (0.0-2.0, lower = more consistent)") + maxTokens: Optional[int] = Field(default=None, ge=1, le=32000, description="Maximum tokens in response") + maxParts: Optional[int] = Field(default=1000, ge=1, le=1000, description="Maximum number of continuation parts to fetch") class AiCallRequest(BaseModel): diff --git a/modules/datamodels/datamodelChat.py b/modules/datamodels/datamodelChat.py index 62fa691a..31e65004 100644 --- 
a/modules/datamodels/datamodelChat.py +++ b/modules/datamodels/datamodelChat.py @@ -169,7 +169,7 @@ register_model_labels( ) -class ExtractedContent(BaseModel, ModelMixin): +class ChatContentExtracted(BaseModel, ModelMixin): id: str = Field(description="Reference to source ChatDocument") contents: List[ContentItem] = Field( default_factory=list, description="List of content items" @@ -177,7 +177,7 @@ class ExtractedContent(BaseModel, ModelMixin): register_model_labels( - "ExtractedContent", + "ChatContentExtracted", {"en": "Extracted Content", "fr": "Contenu extrait"}, { "id": {"en": "Object ID", "fr": "ID de l'objet"}, @@ -201,6 +201,9 @@ class ChatMessage(BaseModel, ModelMixin): None, description="Label for the set of documents" ) message: Optional[str] = Field(None, description="Message content") + summary: Optional[str] = Field( + None, description="Short summary of this message for planning/history" + ) role: str = Field(description="Role of the message sender") status: str = Field(description="Status of the message (first, step, last)") sequenceNr: int = Field( @@ -244,6 +247,7 @@ register_model_labels( "documents": {"en": "Documents", "fr": "Documents"}, "documentsLabel": {"en": "Documents Label", "fr": "Label des documents"}, "message": {"en": "Message", "fr": "Message"}, + "summary": {"en": "Summary", "fr": "Résumé"}, "role": {"en": "Role", "fr": "Rôle"}, "status": {"en": "Status", "fr": "Statut"}, "sequenceNr": {"en": "Sequence Number", "fr": "Numéro de séquence"}, @@ -419,34 +423,6 @@ register_model_labels( ) -class WorkflowResult(BaseModel, ModelMixin): - status: str - completed_tasks: int - total_tasks: int - execution_time: float - final_results_count: int - error: Optional[str] = None - phase: Optional[str] = None - - -register_model_labels( - "WorkflowResult", - {"en": "Workflow Result", "fr": "Résultat du workflow"}, - { - "status": {"en": "Status", "fr": "Statut"}, - "completed_tasks": {"en": "Completed Tasks", "fr": "Tâches terminées"}, - 
"total_tasks": {"en": "Total Tasks", "fr": "Total des tâches"}, - "execution_time": {"en": "Execution Time", "fr": "Temps d'exécution"}, - "final_results_count": { - "en": "Final Results Count", - "fr": "Nombre de résultats finaux", - }, - "error": {"en": "Error", "fr": "Erreur"}, - "phase": {"en": "Phase", "fr": "Phase"}, - }, -) - - class UserInputRequest(BaseModel, ModelMixin): prompt: str = Field(description="Prompt for the user") listFileId: List[str] = Field(default_factory=list, description="List of file IDs") @@ -462,3 +438,519 @@ register_model_labels( "userLanguage": {"en": "User Language", "fr": "Langue de l'utilisateur"}, }, ) + + +class ActionDocument(BaseModel, ModelMixin): + """Clear document structure for action results""" + + documentName: str = Field(description="Name of the document") + documentData: Any = Field(description="Content/data of the document") + mimeType: str = Field(description="MIME type of the document") + + +register_model_labels( + "ActionDocument", + {"en": "Action Document", "fr": "Document d'action"}, + { + "documentName": {"en": "Document Name", "fr": "Nom du document"}, + "documentData": {"en": "Document Data", "fr": "Données du document"}, + "mimeType": {"en": "MIME Type", "fr": "Type MIME"}, + }, +) + + +class ActionResult(BaseModel, ModelMixin): + """Clean action result with documents as primary output + + IMPORTANT: Action methods should NOT set resultLabel in their return value. + The resultLabel is managed by the action handler using the action's execResultLabel + from the action plan. This ensures consistent document routing throughout the workflow. 
+ """ + + success: bool = Field(description="Whether execution succeeded") + error: Optional[str] = Field(None, description="Error message if failed") + documents: List[ActionDocument] = Field( + default_factory=list, description="Document outputs" + ) + resultLabel: Optional[str] = Field( + None, + description="Label for document routing (set by action handler, not by action methods)", + ) + + @classmethod + def isSuccess(cls, documents: List[ActionDocument] = None) -> "ActionResult": + return cls(success=True, documents=documents or []) + + @classmethod + def isFailure( + cls, error: str, documents: List[ActionDocument] = None + ) -> "ActionResult": + return cls(success=False, documents=documents or [], error=error) + + +register_model_labels( + "ActionResult", + {"en": "Action Result", "fr": "Résultat de l'action"}, + { + "success": {"en": "Success", "fr": "Succès"}, + "error": {"en": "Error", "fr": "Erreur"}, + "documents": {"en": "Documents", "fr": "Documents"}, + "resultLabel": {"en": "Result Label", "fr": "Étiquette du résultat"}, + }, +) + + +class ActionSelection(BaseModel, ModelMixin): + method: str = Field(description="Method to execute (e.g., web, document, ai)") + name: str = Field( + description="Action name within the method (e.g., search, extract)" + ) + + +register_model_labels( + "ActionSelection", + {"en": "Action Selection", "fr": "Sélection d'action"}, + { + "method": {"en": "Method", "fr": "Méthode"}, + "name": {"en": "Action Name", "fr": "Nom de l'action"}, + }, +) + + +class ActionParameters(BaseModel, ModelMixin): + parameters: Dict[str, Any] = Field( + default_factory=dict, description="Parameters to execute the selected action" + ) + + +register_model_labels( + "ActionParameters", + {"en": "Action Parameters", "fr": "Paramètres d'action"}, + { + "parameters": {"en": "Parameters", "fr": "Paramètres"}, + }, +) + + +class ObservationPreview(BaseModel, ModelMixin): + name: str = Field(description="Document name or URL label") + mime: str = 
Field(description="MIME type or kind") + snippet: str = Field(description="Short snippet or summary") + + +register_model_labels( + "ObservationPreview", + {"en": "Observation Preview", "fr": "Aperçu d'observation"}, + { + "name": {"en": "Name", "fr": "Nom"}, + "mime": {"en": "MIME", "fr": "MIME"}, + "snippet": {"en": "Snippet", "fr": "Extrait"}, + }, +) + + +class Observation(BaseModel, ModelMixin): + success: bool = Field(description="Action execution success flag") + resultLabel: str = Field(description="Deterministic label for produced documents") + documentsCount: int = Field(description="Number of produced documents") + previews: List[ObservationPreview] = Field( + default_factory=list, description="Compact previews of outputs" + ) + notes: List[str] = Field( + default_factory=list, description="Short notes or key facts" + ) + + +register_model_labels( + "Observation", + {"en": "Observation", "fr": "Observation"}, + { + "success": {"en": "Success", "fr": "Succès"}, + "resultLabel": {"en": "Result Label", "fr": "Étiquette du résultat"}, + "documentsCount": {"en": "Documents Count", "fr": "Nombre de documents"}, + "previews": {"en": "Previews", "fr": "Aperçus"}, + "notes": {"en": "Notes", "fr": "Notes"}, + }, +) + + +class TaskStatus(str): + PENDING = "pending" + RUNNING = "running" + COMPLETED = "completed" + FAILED = "failed" + CANCELLED = "cancelled" + + +register_model_labels( + "TaskStatus", + {"en": "Task Status", "fr": "Statut de la tâche"}, + { + "PENDING": {"en": "Pending", "fr": "En attente"}, + "RUNNING": {"en": "Running", "fr": "En cours"}, + "COMPLETED": {"en": "Completed", "fr": "Terminé"}, + "FAILED": {"en": "Failed", "fr": "Échec"}, + "CANCELLED": {"en": "Cancelled", "fr": "Annulé"}, + }, +) + + +class DocumentExchange(BaseModel, ModelMixin): + documentsLabel: str = Field(description="Label for the set of documents") + documents: List[str] = Field( + default_factory=list, description="List of document references" + ) + + +register_model_labels( 
+ "DocumentExchange", + {"en": "Document Exchange", "fr": "Échange de documents"}, + { + "documentsLabel": {"en": "Documents Label", "fr": "Label des documents"}, + "documents": {"en": "Documents", "fr": "Documents"}, + }, +) + + +class ActionItem(BaseModel, ModelMixin): + id: str = Field(..., description="Action ID") + execMethod: str = Field(..., description="Method to execute") + execAction: str = Field(..., description="Action to perform") + execParameters: Dict[str, Any] = Field( + default_factory=dict, description="Action parameters" + ) + execResultLabel: Optional[str] = Field( + None, description="Label for the set of result documents" + ) + expectedDocumentFormats: Optional[List[Dict[str, str]]] = Field( + None, description="Expected document formats (optional)" + ) + userMessage: Optional[str] = Field( + None, description="User-friendly message in user's language" + ) + status: TaskStatus = Field(default=TaskStatus.PENDING, description="Action status") + error: Optional[str] = Field(None, description="Error message if action failed") + retryCount: int = Field(default=0, description="Number of retries attempted") + retryMax: int = Field(default=3, description="Maximum number of retries") + processingTime: Optional[float] = Field( + None, description="Processing time in seconds" + ) + timestamp: float = Field( + ..., description="When the action was executed (UTC timestamp in seconds)" + ) + result: Optional[str] = Field(None, description="Result of the action") + + def setSuccess(self, result: str = None) -> None: + """Set the action as successful with optional result""" + self.status = TaskStatus.COMPLETED + self.error = None + if result is not None: + self.result = result + + def setError(self, error_message: str) -> None: + """Set the action as failed with error message""" + self.status = TaskStatus.FAILED + self.error = error_message + + +register_model_labels( + "ActionItem", + {"en": "Task Action", "fr": "Action de tâche"}, + { + "id": {"en": "Action 
ID", "fr": "ID de l'action"}, + "execMethod": {"en": "Method", "fr": "Méthode"}, + "execAction": {"en": "Action", "fr": "Action"}, + "execParameters": {"en": "Parameters", "fr": "Paramètres"}, + "execResultLabel": {"en": "Result Label", "fr": "Label du résultat"}, + "expectedDocumentFormats": { + "en": "Expected Document Formats", + "fr": "Formats de documents attendus", + }, + "userMessage": {"en": "User Message", "fr": "Message utilisateur"}, + "status": {"en": "Status", "fr": "Statut"}, + "error": {"en": "Error", "fr": "Erreur"}, + "retryCount": {"en": "Retry Count", "fr": "Nombre de tentatives"}, + "retryMax": {"en": "Max Retries", "fr": "Tentatives max"}, + "processingTime": {"en": "Processing Time", "fr": "Temps de traitement"}, + "timestamp": {"en": "Timestamp", "fr": "Horodatage"}, + "result": {"en": "Result", "fr": "Résultat"}, + }, +) + + +class TaskResult(BaseModel, ModelMixin): + taskId: str = Field(..., description="Task ID") + status: TaskStatus = Field(default=TaskStatus.PENDING, description="Task status") + success: bool = Field(..., description="Whether the task was successful") + feedback: Optional[str] = Field(None, description="Task feedback message") + error: Optional[str] = Field(None, description="Error message if task failed") + + +register_model_labels( + "TaskResult", + {"en": "Task Result", "fr": "Résultat de tâche"}, + { + "taskId": {"en": "Task ID", "fr": "ID de la tâche"}, + "status": {"en": "Status", "fr": "Statut"}, + "success": {"en": "Success", "fr": "Succès"}, + "feedback": {"en": "Feedback", "fr": "Retour"}, + "error": {"en": "Error", "fr": "Erreur"}, + }, +) + + +class TaskItem(BaseModel, ModelMixin): + id: str = Field(..., description="Task ID") + workflowId: str = Field(..., description="Workflow ID") + userInput: str = Field(..., description="User input that triggered the task") + status: TaskStatus = Field(default=TaskStatus.PENDING, description="Task status") + error: Optional[str] = Field(None, description="Error message 
if task failed") + startedAt: Optional[float] = Field( + None, description="When the task started (UTC timestamp in seconds)" + ) + finishedAt: Optional[float] = Field( + None, description="When the task finished (UTC timestamp in seconds)" + ) + actionList: List[ActionItem] = Field( + default_factory=list, description="List of actions to execute" + ) + retryCount: int = Field(default=0, description="Number of retries attempted") + retryMax: int = Field(default=3, description="Maximum number of retries") + rollbackOnFailure: bool = Field( + default=True, description="Whether to rollback on failure" + ) + dependencies: List[str] = Field( + default_factory=list, description="List of task IDs this task depends on" + ) + feedback: Optional[str] = Field(None, description="Task feedback message") + processingTime: Optional[float] = Field( + None, description="Total processing time in seconds" + ) + resultLabels: Optional[Dict[str, Any]] = Field( + default_factory=dict, description="Map of result labels to their values" + ) + + +register_model_labels( + "TaskItem", + {"en": "Task", "fr": "Tâche"}, + { + "id": {"en": "Task ID", "fr": "ID de la tâche"}, + "workflowId": {"en": "Workflow ID", "fr": "ID du workflow"}, + "userInput": {"en": "User Input", "fr": "Entrée utilisateur"}, + "status": {"en": "Status", "fr": "Statut"}, + "error": {"en": "Error", "fr": "Erreur"}, + "startedAt": {"en": "Started At", "fr": "Démarré à"}, + "finishedAt": {"en": "Finished At", "fr": "Terminé à"}, + "actionList": {"en": "Actions", "fr": "Actions"}, + "retryCount": {"en": "Retry Count", "fr": "Nombre de tentatives"}, + "retryMax": {"en": "Max Retries", "fr": "Tentatives max"}, + "processingTime": {"en": "Processing Time", "fr": "Temps de traitement"}, + }, +) + + +class TaskStep(BaseModel, ModelMixin): + id: str + objective: str + dependencies: Optional[list[str]] = Field(default_factory=list) + success_criteria: Optional[list[str]] = Field(default_factory=list) + estimated_complexity: 
Optional[str] = None + userMessage: Optional[str] = Field( + None, description="User-friendly message in user's language" + ) + + +register_model_labels( + "TaskStep", + {"en": "Task Step", "fr": "Étape de tâche"}, + { + "id": {"en": "ID", "fr": "ID"}, + "objective": {"en": "Objective", "fr": "Objectif"}, + "dependencies": {"en": "Dependencies", "fr": "Dépendances"}, + "success_criteria": {"en": "Success Criteria", "fr": "Critères de succès"}, + "estimated_complexity": { + "en": "Estimated Complexity", + "fr": "Complexité estimée", + }, + "userMessage": {"en": "User Message", "fr": "Message utilisateur"}, + }, +) + + +class TaskHandover(BaseModel, ModelMixin): + taskId: str = Field(description="Target task ID") + sourceTask: Optional[str] = Field(None, description="Source task ID") + inputDocuments: List[DocumentExchange] = Field( + default_factory=list, description="Available input documents" + ) + outputDocuments: List[DocumentExchange] = Field( + default_factory=list, description="Produced output documents" + ) + context: Dict[str, Any] = Field(default_factory=dict, description="Task context") + previousResults: List[str] = Field( + default_factory=list, description="Previous result summaries" + ) + improvements: List[str] = Field( + default_factory=list, description="Improvement suggestions" + ) + workflowSummary: Optional[str] = Field( + None, description="Summarized workflow context" + ) + messageHistory: List[str] = Field( + default_factory=list, description="Key message summaries" + ) + timestamp: float = Field( + ..., description="When the handover was created (UTC timestamp in seconds)" + ) + handoverType: str = Field( + default="task", description="Type of handover: task, phase, or workflow" + ) + + +register_model_labels( + "TaskHandover", + {"en": "Task Handover", "fr": "Transfert de tâche"}, + { + "taskId": {"en": "Task ID", "fr": "ID de la tâche"}, + "sourceTask": {"en": "Source Task", "fr": "Tâche source"}, + "inputDocuments": {"en": "Input 
Documents", "fr": "Documents d'entrée"}, + "outputDocuments": {"en": "Output Documents", "fr": "Documents de sortie"}, + "context": {"en": "Context", "fr": "Contexte"}, + "previousResults": {"en": "Previous Results", "fr": "Résultats précédents"}, + "improvements": {"en": "Improvements", "fr": "Améliorations"}, + "workflowSummary": {"en": "Workflow Summary", "fr": "Résumé du workflow"}, + "messageHistory": {"en": "Message History", "fr": "Historique des messages"}, + "timestamp": {"en": "Timestamp", "fr": "Horodatage"}, + "handoverType": {"en": "Handover Type", "fr": "Type de transfert"}, + }, +) + + +class TaskContext(BaseModel, ModelMixin): + task_step: TaskStep + workflow: Optional["ChatWorkflow"] = None + workflow_id: Optional[str] = None + available_documents: Optional[str] = "No documents available" + available_connections: Optional[list[str]] = Field(default_factory=list) + previous_results: Optional[list[str]] = Field(default_factory=list) + previous_handover: Optional[TaskHandover] = None + improvements: Optional[list[str]] = Field(default_factory=list) + retry_count: Optional[int] = 0 + previous_action_results: Optional[list] = Field(default_factory=list) + previous_review_result: Optional[dict] = None + is_regeneration: Optional[bool] = False + failure_patterns: Optional[list[str]] = Field(default_factory=list) + failed_actions: Optional[list] = Field(default_factory=list) + successful_actions: Optional[list] = Field(default_factory=list) + criteria_progress: Optional[dict] = None + + def getDocumentReferences(self) -> List[str]: + docs = [] + if self.previous_handover: + for doc_exchange in self.previous_handover.inputDocuments: + docs.extend(doc_exchange.documents) + return list(set(docs)) + + def addImprovement(self, improvement: str) -> None: + if improvement not in (self.improvements or []): + if self.improvements is None: + self.improvements = [] + self.improvements.append(improvement) + + +class ReviewContext(BaseModel, ModelMixin): + task_step: 
TaskStep + task_actions: Optional[list] = Field(default_factory=list) + action_results: Optional[list] = Field(default_factory=list) + step_result: Optional[dict] = Field(default_factory=dict) + workflow_id: Optional[str] = None + previous_results: Optional[list[str]] = Field(default_factory=list) + + +class ReviewResult(BaseModel, ModelMixin): + status: str + reason: Optional[str] = None + improvements: Optional[list[str]] = Field(default_factory=list) + quality_score: Optional[int] = 5 + missing_outputs: Optional[list[str]] = Field(default_factory=list) + met_criteria: Optional[list[str]] = Field(default_factory=list) + unmet_criteria: Optional[list[str]] = Field(default_factory=list) + confidence: Optional[float] = 0.5 + userMessage: Optional[str] = Field( + None, description="User-friendly message in user's language" + ) + + +register_model_labels( + "ReviewResult", + {"en": "Review Result", "fr": "Résultat de l'évaluation"}, + { + "status": {"en": "Status", "fr": "Statut"}, + "reason": {"en": "Reason", "fr": "Raison"}, + "improvements": {"en": "Improvements", "fr": "Améliorations"}, + "quality_score": {"en": "Quality Score", "fr": "Score de qualité"}, + "missing_outputs": {"en": "Missing Outputs", "fr": "Sorties manquantes"}, + "met_criteria": {"en": "Met Criteria", "fr": "Critères respectés"}, + "unmet_criteria": {"en": "Unmet Criteria", "fr": "Critères non respectés"}, + "confidence": {"en": "Confidence", "fr": "Confiance"}, + "userMessage": {"en": "User Message", "fr": "Message utilisateur"}, + }, +) + + +class TaskPlan(BaseModel, ModelMixin): + overview: str + tasks: list[TaskStep] + userMessage: Optional[str] = Field( + None, description="Overall user-friendly message for the task plan" + ) + + +register_model_labels( + "TaskPlan", + {"en": "Task Plan", "fr": "Plan de tâches"}, + { + "overview": {"en": "Overview", "fr": "Aperçu"}, + "tasks": {"en": "Tasks", "fr": "Tâches"}, + "userMessage": {"en": "User Message", "fr": "Message utilisateur"}, + }, +) + +# 
Resolve forward references +TaskContext.update_forward_refs() + + +class PromptPlaceholder(BaseModel, ModelMixin): + label: str + content: str + summaryAllowed: bool = Field( + default=False, + description="Whether host may summarize content before sending to AI", + ) + + +register_model_labels( + "PromptPlaceholder", + {"en": "Prompt Placeholder", "fr": "Espace réservé d'invite"}, + { + "label": {"en": "Label", "fr": "Libellé"}, + "content": {"en": "Content", "fr": "Contenu"}, + "summaryAllowed": {"en": "Summary Allowed", "fr": "Résumé autorisé"}, + }, +) + + +class PromptBundle(BaseModel, ModelMixin): + prompt: str + placeholders: List[PromptPlaceholder] = Field(default_factory=list) + + +register_model_labels( + "PromptBundle", + {"en": "Prompt Bundle", "fr": "Lot d'invite"}, + { + "prompt": {"en": "Prompt", "fr": "Invite"}, + "placeholders": {"en": "Placeholders", "fr": "Espaces réservés"}, + }, +) diff --git a/modules/datamodels/datamodelExtraction.py b/modules/datamodels/datamodelExtraction.py index 61d12977..ff44aa19 100644 --- a/modules/datamodels/datamodelExtraction.py +++ b/modules/datamodels/datamodelExtraction.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Literal from pydantic import BaseModel, Field @@ -12,8 +12,114 @@ class ContentPart(BaseModel): metadata: Dict[str, Any] = Field(default_factory=dict, description="Arbitrary metadata for the part") -class ExtractedContent(BaseModel): +class ContentExtracted(BaseModel): id: str = Field(description="Extraction id or source document id") parts: List[ContentPart] = Field(default_factory=list, description="List of extracted parts") summary: Optional[Dict[str, Any]] = Field(default=None, description="Optional extraction summary") + +class MergeStrategy(BaseModel): + """Strategy configuration for merging content parts and AI results.""" + + # Grouping configuration + groupBy: str = Field( + default="typeGroup", + description="Field to group 
parts by (typeGroup, parentId, label, etc.)" + ) + + # Ordering configuration + orderBy: str = Field( + default="id", + description="Field to order parts within groups (id, order, pageIndex, etc.)" + ) + + # Merge behavior + mergeType: Literal["concatenate", "hierarchical", "intelligent"] = Field( + default="concatenate", + description="How to merge content within groups" + ) + + # Size limits + maxSize: Optional[int] = Field( + default=None, + description="Maximum size for merged content in bytes" + ) + + # Type-specific merge settings + textMerge: Optional[Dict[str, Any]] = Field( + default=None, + description="Text-specific merge settings (separator, formatting, etc.)" + ) + + tableMerge: Optional[Dict[str, Any]] = Field( + default=None, + description="Table-specific merge settings (header handling, etc.)" + ) + + structureMerge: Optional[Dict[str, Any]] = Field( + default=None, + description="Structure-specific merge settings (hierarchy, etc.)" + ) + + # AI result merging + aiResultMerge: Optional[Dict[str, Any]] = Field( + default=None, + description="AI result merging settings (prompt, context, etc.)" + ) + + # Chunk handling + preserveChunks: bool = Field( + default=False, + description="Whether to preserve individual chunks or merge them" + ) + + chunkSeparator: str = Field( + default="\n\n---\n\n", + description="Separator between chunks when merging" + ) + + # Metadata handling + preserveMetadata: bool = Field( + default=True, + description="Whether to preserve metadata from original parts" + ) + + metadataFields: Optional[List[str]] = Field( + default=None, + description="Specific metadata fields to preserve (None = all)" + ) + + # Error handling + onError: Literal["skip", "include", "fail"] = Field( + default="skip", + description="How to handle errors during merging" + ) + + # Validation + validateContent: bool = Field( + default=True, + description="Whether to validate content before merging" + ) + + def getTypeSpecificSettings(self, typeGroup: str) 
-> Dict[str, Any]: + """Get type-specific merge settings for a content type.""" + if typeGroup == "text" and self.textMerge: + return self.textMerge + elif typeGroup == "table" and self.tableMerge: + return self.tableMerge + elif typeGroup == "structure" and self.structureMerge: + return self.structureMerge + else: + return {} + + def shouldPreserveChunk(self, chunk: Dict[str, Any]) -> bool: + """Determine if a chunk should be preserved based on strategy.""" + if not self.preserveChunks: + return False + + # Check if chunk has error metadata + if self.onError == "skip" and chunk.get("metadata", {}).get("error"): + return False + + return True + diff --git a/modules/datamodels/datamodelFiles.py b/modules/datamodels/datamodelFiles.py index d561c79e..41cb0cb6 100644 --- a/modules/datamodels/datamodelFiles.py +++ b/modules/datamodels/datamodelFiles.py @@ -19,8 +19,6 @@ class FileItem(BaseModel, ModelMixin): def to_dict(self) -> Dict[str, Any]: return super().to_dict() - - register_model_labels( "FileItem", {"en": "File Item", "fr": "Élément de fichier"}, @@ -35,7 +33,6 @@ register_model_labels( }, ) - class FilePreview(BaseModel, ModelMixin): content: Union[str, bytes] = Field(description="File content (text or binary)") mimeType: str = Field(description="MIME type of the file") @@ -49,8 +46,6 @@ class FilePreview(BaseModel, ModelMixin): if isinstance(data.get("content"), bytes): data["content"] = base64.b64encode(data["content"]).decode("utf-8") return data - - register_model_labels( "FilePreview", {"en": "File Preview", "fr": "Aperçu du fichier"}, @@ -64,13 +59,10 @@ register_model_labels( }, ) - class FileData(BaseModel, ModelMixin): id: str = Field(default_factory=lambda: str(uuid.uuid4()), description="Primary key") data: str = Field(description="File data content") base64Encoded: bool = Field(description="Whether the data is base64 encoded") - - register_model_labels( "FileData", {"en": "File Data", "fr": "Données de fichier"}, @@ -80,5 +72,3 @@ 
register_model_labels( "base64Encoded": {"en": "Base64 Encoded", "fr": "Encodé en Base64"}, }, ) - - diff --git a/modules/datamodels/datamodelNeutralizer.py b/modules/datamodels/datamodelNeutralizer.py index 475b1146..998cb17f 100644 --- a/modules/datamodels/datamodelNeutralizer.py +++ b/modules/datamodels/datamodelNeutralizer.py @@ -14,8 +14,6 @@ class DataNeutraliserConfig(BaseModel, ModelMixin): namesToParse: str = Field(default="", description="Multiline list of names to parse for neutralization", frontend_type="textarea", frontend_readonly=False, frontend_required=False) sharepointSourcePath: str = Field(default="", description="SharePoint path to read files for neutralization", frontend_type="text", frontend_readonly=False, frontend_required=False) sharepointTargetPath: str = Field(default="", description="SharePoint path to store neutralized files", frontend_type="text", frontend_readonly=False, frontend_required=False) - - register_model_labels( "DataNeutraliserConfig", {"en": "Data Neutralization Config", "fr": "Configuration de neutralisation des données"}, @@ -30,7 +28,6 @@ register_model_labels( }, ) - class DataNeutralizerAttributes(BaseModel, ModelMixin): id: str = Field(default_factory=lambda: str(uuid.uuid4()), description="Unique ID of the attribute mapping (used as UID in neutralized files)", frontend_type="text", frontend_readonly=True, frontend_required=False) mandateId: str = Field(description="ID of the mandate this attribute belongs to", frontend_type="text", frontend_readonly=True, frontend_required=True) @@ -38,8 +35,6 @@ class DataNeutralizerAttributes(BaseModel, ModelMixin): originalText: str = Field(description="Original text that was neutralized", frontend_type="text", frontend_readonly=True, frontend_required=True) fileId: Optional[str] = Field(default=None, description="ID of the file this attribute belongs to", frontend_type="text", frontend_readonly=True, frontend_required=False) patternType: str = Field(description="Type of pattern 
that matched (email, phone, name, etc.)", frontend_type="text", frontend_readonly=True, frontend_required=True) - - register_model_labels( "DataNeutralizerAttributes", {"en": "Neutralized Data Attribute", "fr": "Attribut de données neutralisées"}, diff --git a/modules/datamodels/datamodelSecurity.py b/modules/datamodels/datamodelSecurity.py index fa8b8ed7..6184d236 100644 --- a/modules/datamodels/datamodelSecurity.py +++ b/modules/datamodels/datamodelSecurity.py @@ -47,7 +47,8 @@ class Token(BaseModel, ModelMixin): None, description="Mandate ID for tenant scoping of the token" ) - model_config = ConfigDict(use_enum_values=True) + class Config: + use_enum_values = True register_model_labels( diff --git a/modules/datamodels/datamodelTickets.py b/modules/datamodels/datamodelTickets.py index d11606c6..40478bc6 100644 --- a/modules/datamodels/datamodelTickets.py +++ b/modules/datamodels/datamodelTickets.py @@ -9,7 +9,6 @@ class TicketFieldAttribute(BaseModel): fieldName: str = Field(description="Human-readable field name") field: str = Field(description="Ticket field ID/key") - class TicketBase(ABC): @abstractmethod async def read_attributes(self) -> list[TicketFieldAttribute]: ... 
diff --git a/modules/datamodels/datamodelUam.py b/modules/datamodels/datamodelUam.py index 283ff882..8bd24d8c 100644 --- a/modules/datamodels/datamodelUam.py +++ b/modules/datamodels/datamodelUam.py @@ -13,20 +13,17 @@ class AuthAuthority(str, Enum): GOOGLE = "google" MSFT = "msft" - class UserPrivilege(str, Enum): SYSADMIN = "sysadmin" ADMIN = "admin" USER = "user" - class ConnectionStatus(str, Enum): ACTIVE = "active" EXPIRED = "expired" REVOKED = "revoked" PENDING = "pending" - class Mandate(BaseModel, ModelMixin): id: str = Field(default_factory=lambda: str(uuid.uuid4()), description="Unique ID of the mandate", frontend_type="text", frontend_readonly=True, frontend_required=False) name: str = Field(description="Name of the mandate", frontend_type="text", frontend_readonly=False, frontend_required=True) @@ -37,8 +34,6 @@ class Mandate(BaseModel, ModelMixin): {"value": "it", "label": {"en": "Italiano", "fr": "Italien"}}, ]) enabled: bool = Field(default=True, description="Indicates whether the mandate is enabled", frontend_type="checkbox", frontend_readonly=False, frontend_required=False) - - register_model_labels( "Mandate", {"en": "Mandate", "fr": "Mandat"}, @@ -50,7 +45,6 @@ register_model_labels( }, ) - class UserConnection(BaseModel, ModelMixin): id: str = Field(default_factory=lambda: str(uuid.uuid4()), description="Unique ID of the connection", frontend_type="text", frontend_readonly=True, frontend_required=False) userId: str = Field(description="ID of the user this connection belongs to", frontend_type="text", frontend_readonly=True, frontend_required=False) @@ -77,8 +71,6 @@ class UserConnection(BaseModel, ModelMixin): {"value": "none", "label": {"en": "None", "fr": "Aucun"}}, ]) tokenExpiresAt: Optional[float] = Field(None, description="When the current token expires (UTC timestamp in seconds)", frontend_type="timestamp", frontend_readonly=True, frontend_required=False) - - register_model_labels( "UserConnection", {"en": "User Connection", "fr": 
"Connexion utilisateur"}, @@ -98,7 +90,6 @@ register_model_labels( }, ) - class User(BaseModel, ModelMixin): id: str = Field(default_factory=lambda: str(uuid.uuid4()), description="Unique ID of the user", frontend_type="text", frontend_readonly=True, frontend_required=False) username: str = Field(description="Username for login", frontend_type="text", frontend_readonly=False, frontend_required=True) @@ -122,8 +113,6 @@ class User(BaseModel, ModelMixin): {"value": "msft", "label": {"en": "Microsoft", "fr": "Microsoft"}}, ]) mandateId: Optional[str] = Field(None, description="ID of the mandate this user belongs to", frontend_type="text", frontend_readonly=True, frontend_required=False) - - register_model_labels( "User", {"en": "User", "fr": "Utilisateur"}, @@ -140,15 +129,10 @@ register_model_labels( }, ) - class UserInDB(User): hashedPassword: Optional[str] = Field(None, description="Hash of the user password") - - register_model_labels( "UserInDB", {"en": "User Access", "fr": "Accès de l'utilisateur"}, {"hashedPassword": {"en": "Password hash", "fr": "Hachage de mot de passe"}}, ) - - diff --git a/modules/datamodels/datamodelUtils.py b/modules/datamodels/datamodelUtils.py index 82a888e7..ccd01c04 100644 --- a/modules/datamodels/datamodelUtils.py +++ b/modules/datamodels/datamodelUtils.py @@ -10,8 +10,6 @@ class Prompt(BaseModel, ModelMixin): mandateId: str = Field(description="ID of the mandate this prompt belongs to", frontend_type="text", frontend_readonly=True, frontend_required=False) content: str = Field(description="Content of the prompt", frontend_type="textarea", frontend_readonly=False, frontend_required=True) name: str = Field(description="Name of the prompt", frontend_type="text", frontend_readonly=False, frontend_required=True) - - register_model_labels( "Prompt", {"en": "Prompt", "fr": "Invite"}, diff --git a/modules/datamodels/datamodelVoice.py b/modules/datamodels/datamodelVoice.py index 3fc69cd8..274a507b 100644 --- 
a/modules/datamodels/datamodelVoice.py +++ b/modules/datamodels/datamodelVoice.py @@ -22,7 +22,6 @@ class VoiceSettings(BaseModel, ModelMixin): def to_dict(self) -> Dict[str, Any]: return super().to_dict() - register_model_labels( "VoiceSettings", {"en": "Voice Settings", "fr": "Paramètres vocaux"}, diff --git a/modules/datamodels/datamodelWeb.py b/modules/datamodels/datamodelWeb.py index 26f782ec..bc1e03e3 100644 --- a/modules/datamodels/datamodelWeb.py +++ b/modules/datamodels/datamodelWeb.py @@ -1,10 +1,8 @@ """Web-related modules""" - -from abc import ABC, abstractmethod from pydantic import BaseModel, Field, HttpUrl -from typing import List, Optional, Literal +from typing import List, Optional, Literal, Dict, Any from modules.shared.configuration import APP_CONFIG -from modules.datamodels.datamodelWorkflow import ActionDocument, ActionResult +from modules.datamodels.datamodelChat import ActionDocument, ActionResult WEB_SEARCH_MAX_QUERY_LENGTH: int = int(APP_CONFIG.get("Web_Search_MAX_QUERY_LENGTH", "400")) @@ -12,130 +10,133 @@ WEB_SEARCH_MAX_RESULTS: int = int(APP_CONFIG.get("Web_Search_MAX_RESULTS", "20") WEB_SEARCH_MIN_RESULTS: int = int(APP_CONFIG.get("Web_Search_MIN_RESULTS", "1")) -class WebSearchRequest(BaseModel): - query: str = Field(min_length=1, max_length=WEB_SEARCH_MAX_QUERY_LENGTH) - max_results: int = Field(ge=WEB_SEARCH_MIN_RESULTS, le=WEB_SEARCH_MAX_RESULTS) - # Tavily tuning options - search_depth: Optional[Literal["basic", "advanced"]] = Field(default=None) - time_range: Optional[Literal["d", "w", "m", "y"]] = Field( - default=None, description="Limit results to last day/week/month/year" - ) - topic: Optional[Literal["general", "news", "academic"]] = Field(default=None) - include_domains: Optional[List[str]] = Field(default=None) - exclude_domains: Optional[List[str]] = Field(default=None) - language: Optional[str] = Field(default=None, description="ISO language code like 'en', 'de'") - include_answer: Optional[bool] = Field(default=None) - 
include_raw_content: Optional[bool] = Field(default=None) +class WebResearchOptions(BaseModel): + """Advanced options for web research workflow""" + max_pages: int = Field(default=10, ge=1, le=50, description="Maximum pages to crawl") + search_depth: Literal["basic", "advanced"] = Field(default="basic", description="Tavily search depth") + extract_depth: Literal["basic", "advanced"] = Field(default="advanced", description="Tavily extract depth") + format: Literal["text", "markdown"] = Field(default="markdown", description="Content format") + return_report: bool = Field(default=True, description="Return formatted report or raw data") + pages_search_depth: int = Field(default=1, ge=1, le=5, description="How deep to crawl: 1=main pages only, 2=main+sub-pages, 3=main+sub+sub-sub, etc.") + country: Optional[str] = Field(default=None, description="Country code for search bias") + time_range: Optional[Literal["d", "w", "m", "y"]] = Field(default=None, description="Time range for search") + topic: Optional[Literal["general", "news", "academic"]] = Field(default=None, description="Search topic") + language: Optional[str] = Field(default=None, description="Language code") + include_answer: Optional[bool] = Field(default=None, description="Include AI answer") + include_raw_content: Optional[bool] = Field(default=None, description="Include raw content") +class WebResearchRequest(BaseModel): + """Main web research request""" + user_prompt: str = Field(min_length=1, max_length=WEB_SEARCH_MAX_QUERY_LENGTH, description="User's research question or prompt") + urls: Optional[List[str]] = Field(default=None, description="Specific URLs to crawl (optional)") + max_results: int = Field(default=5, ge=1, le=WEB_SEARCH_MAX_RESULTS, description="Max search results") + options: WebResearchOptions = Field(default_factory=WebResearchOptions, description="Advanced options") class WebSearchResultItem(BaseModel): """Individual search result""" - title: str url: HttpUrl + raw_content: 
Optional[str] = Field(default=None, description="Raw HTML content") +class WebCrawlResultItem(BaseModel): + """Individual crawl result""" + url: HttpUrl + content: str + +class WebResearchDocumentData(BaseModel): + """Complete web research results""" + user_prompt: str + websites_analyzed: int + additional_links_found: int + analysis_result: str + sources: List[WebSearchResultItem] + additional_links: List[str] + individual_content: Optional[Dict[str, str]] = None # URL -> content mapping + debug_info: Optional[Dict[str, Any]] = None + +class WebResearchActionDocument(ActionDocument): + documentData: WebResearchDocumentData + +class WebResearchActionResult(ActionResult): + documents: List[WebResearchActionDocument] = Field(default_factory=list) + +# Legacy models for connector compatibility class WebSearchDocumentData(BaseModel): - """Complete search (and scrape) results document""" - - query: str = Field(min_length=1, max_length=WEB_SEARCH_MAX_QUERY_LENGTH) - # Allow both WebSearchResultItem and WebScrapeResultItem to be stored here - results: List[object] + """Search results document data""" + query: str + results: List[WebSearchResultItem] total_count: int - class WebSearchActionDocument(ActionDocument): documentData: WebSearchDocumentData - class WebSearchActionResult(ActionResult): documents: List[WebSearchActionDocument] = Field(default_factory=list) - -class WebSearchBase(ABC): - @abstractmethod - async def search_urls(self, request: WebSearchRequest) -> WebSearchActionResult: ... 
- - -# --- Web crawl --- - - -class WebCrawlRequest(BaseModel): - urls: List[HttpUrl] - # Tavily extract options - extract_depth: Optional[Literal["basic", "advanced"]] = Field(default=None) - format: Optional[Literal["text", "markdown"]] = Field(default=None) - - -class WebCrawlResultItem(BaseModel): - """Individual crawl result""" - - url: HttpUrl - content: str - - class WebCrawlDocumentData(BaseModel): - """Complete crawl results document""" - + """Crawl results document data""" urls: List[HttpUrl] results: List[WebCrawlResultItem] total_count: int - class WebCrawlActionDocument(ActionDocument): - documentData: WebCrawlDocumentData = Field( - description="The data extracted from crawled URLs" - ) - + documentData: WebCrawlDocumentData class WebCrawlActionResult(ActionResult): documents: List[WebCrawlActionDocument] = Field(default_factory=list) - -class WebCrawlBase(ABC): - @abstractmethod - async def crawl_urls(self, request: WebCrawlRequest) -> WebCrawlActionResult: ... - - -# --- Web scrape --- - - -class WebScrapeRequest(BaseModel): - query: str = Field(min_length=1, max_length=WEB_SEARCH_MAX_QUERY_LENGTH) - max_results: int = Field(ge=WEB_SEARCH_MIN_RESULTS, le=WEB_SEARCH_MAX_RESULTS) - # Pass-through search options - search_depth: Optional[Literal["basic", "advanced"]] = Field(default=None) - time_range: Optional[Literal["d", "w", "m", "y"]] = Field(default=None) - topic: Optional[Literal["general", "news", "academic"]] = Field(default=None) - include_domains: Optional[List[str]] = Field(default=None) - exclude_domains: Optional[List[str]] = Field(default=None) - language: Optional[str] = Field(default=None) - include_answer: Optional[bool] = Field(default=None) - include_raw_content: Optional[bool] = Field(default=None) - # Extract options - extract_depth: Optional[Literal["basic", "advanced"]] = Field(default=None) - format: Optional[Literal["text", "markdown"]] = Field(default=None) - - -class WebScrapeResultItem(BaseModel): - """Individual scrape 
result""" - - url: HttpUrl - content: str - +class WebScrapeDocumentData(BaseModel): + """Scrape results document data""" + query: str + results: List[WebSearchResultItem] + total_count: int class WebScrapeActionDocument(ActionDocument): - documentData: WebSearchDocumentData = Field( - description="The data extracted from scraped URLs" - ) - + documentData: WebScrapeDocumentData class WebScrapeActionResult(ActionResult): documents: List[WebScrapeActionDocument] = Field(default_factory=list) +class WebSearchRequest(BaseModel): + """Search request for Tavily""" + query: str + max_results: int = 5 + search_depth: Optional[Literal["basic", "advanced"]] = None + time_range: Optional[Literal["d", "w", "m", "y"]] = None + topic: Optional[Literal["general", "news", "academic"]] = None + include_domains: Optional[List[str]] = None + exclude_domains: Optional[List[str]] = None + language: Optional[str] = None + include_answer: Optional[bool] = None + include_raw_content: Optional[bool] = None + auto_parameters: Optional[bool] = None + country: Optional[str] = None -class WebScrapeBase(ABC): - @abstractmethod - async def scrape(self, request: WebScrapeRequest) -> WebScrapeActionResult: ... 
+class WebCrawlRequest(BaseModel): + """Crawl request for Tavily""" + urls: List[HttpUrl] + extract_depth: Optional[Literal["basic", "advanced"]] = None + format: Optional[Literal["text", "markdown"]] = None +class WebScrapeRequest(BaseModel): + """Scrape request for Tavily""" + query: str + max_results: int = 5 + search_depth: Optional[Literal["basic", "advanced"]] = None + time_range: Optional[Literal["d", "w", "m", "y"]] = None + topic: Optional[Literal["general", "news", "academic"]] = None + include_domains: Optional[List[str]] = None + exclude_domains: Optional[List[str]] = None + language: Optional[str] = None + include_answer: Optional[bool] = None + include_raw_content: Optional[bool] = None + auto_parameters: Optional[bool] = None + country: Optional[str] = None + extract_depth: Optional[Literal["basic", "advanced"]] = None + format: Optional[Literal["text", "markdown"]] = None +class WebScrapeResultItem(BaseModel): + """Individual scrape result""" + url: HttpUrl + content: str diff --git a/modules/datamodels/datamodelWorkflow.py b/modules/datamodels/datamodelWorkflow.py deleted file mode 100644 index 686144c3..00000000 --- a/modules/datamodels/datamodelWorkflow.py +++ /dev/null @@ -1,474 +0,0 @@ -"""Workflow-related base datamodels and step/task structures.""" - -from enum import Enum -from typing import List, Dict, Any, Optional -from pydantic import BaseModel, Field -from modules.shared.attributeUtils import register_model_labels, ModelMixin - - -class ActionDocument(BaseModel, ModelMixin): - """Clear document structure for action results""" - - documentName: str = Field(description="Name of the document") - documentData: Any = Field(description="Content/data of the document") - mimeType: str = Field(description="MIME type of the document") - - -register_model_labels( - "ActionDocument", - {"en": "Action Document", "fr": "Document d'action"}, - { - "documentName": {"en": "Document Name", "fr": "Nom du document"}, - "documentData": {"en": "Document 
Data", "fr": "Données du document"}, - "mimeType": {"en": "MIME Type", "fr": "Type MIME"}, - }, -) - - -class ActionResult(BaseModel, ModelMixin): - """Clean action result with documents as primary output - - IMPORTANT: Action methods should NOT set resultLabel in their return value. - The resultLabel is managed by the action handler using the action's execResultLabel - from the action plan. This ensures consistent document routing throughout the workflow. - """ - - success: bool = Field(description="Whether execution succeeded") - error: Optional[str] = Field(None, description="Error message if failed") - documents: List[ActionDocument] = Field( - default_factory=list, description="Document outputs" - ) - resultLabel: Optional[str] = Field( - None, - description="Label for document routing (set by action handler, not by action methods)", - ) - - @classmethod - def isSuccess(cls, documents: List[ActionDocument] = None) -> "ActionResult": - return cls(success=True, documents=documents or []) - - @classmethod - def isFailure( - cls, error: str, documents: List[ActionDocument] = None - ) -> "ActionResult": - return cls(success=False, documents=documents or [], error=error) - - -register_model_labels( - "ActionResult", - {"en": "Action Result", "fr": "Résultat de l'action"}, - { - "success": {"en": "Success", "fr": "Succès"}, - "error": {"en": "Error", "fr": "Erreur"}, - "documents": {"en": "Documents", "fr": "Documents"}, - "resultLabel": {"en": "Result Label", "fr": "Étiquette du résultat"}, - }, -) - - -class ActionSelection(BaseModel, ModelMixin): - method: str = Field(description="Method to execute (e.g., web, document, ai)") - name: str = Field( - description="Action name within the method (e.g., search, extract)" - ) - - -register_model_labels( - "ActionSelection", - {"en": "Action Selection", "fr": "Sélection d'action"}, - { - "method": {"en": "Method", "fr": "Méthode"}, - "name": {"en": "Action Name", "fr": "Nom de l'action"}, - }, -) - - -class 
ActionParameters(BaseModel, ModelMixin): - parameters: Dict[str, Any] = Field( - default_factory=dict, description="Parameters to execute the selected action" - ) - - -register_model_labels( - "ActionParameters", - {"en": "Action Parameters", "fr": "Paramètres d'action"}, - { - "parameters": {"en": "Parameters", "fr": "Paramètres"}, - }, -) - - -class ObservationPreview(BaseModel, ModelMixin): - name: str = Field(description="Document name or URL label") - mime: str = Field(description="MIME type or kind") - snippet: str = Field(description="Short snippet or summary") - - -register_model_labels( - "ObservationPreview", - {"en": "Observation Preview", "fr": "Aperçu d'observation"}, - { - "name": {"en": "Name", "fr": "Nom"}, - "mime": {"en": "MIME", "fr": "MIME"}, - "snippet": {"en": "Snippet", "fr": "Extrait"}, - }, -) - - -class Observation(BaseModel, ModelMixin): - success: bool = Field(description="Action execution success flag") - resultLabel: str = Field(description="Deterministic label for produced documents") - documentsCount: int = Field(description="Number of produced documents") - previews: List[ObservationPreview] = Field( - default_factory=list, description="Compact previews of outputs" - ) - notes: List[str] = Field( - default_factory=list, description="Short notes or key facts" - ) - - -register_model_labels( - "Observation", - {"en": "Observation", "fr": "Observation"}, - { - "success": {"en": "Success", "fr": "Succès"}, - "resultLabel": {"en": "Result Label", "fr": "Étiquette du résultat"}, - "documentsCount": {"en": "Documents Count", "fr": "Nombre de documents"}, - "previews": {"en": "Previews", "fr": "Aperçus"}, - "notes": {"en": "Notes", "fr": "Notes"}, - }, -) - - -class TaskStatus(str, Enum): - """Task status enumeration.""" - - PENDING = "pending" - RUNNING = "running" - COMPLETED = "completed" - FAILED = "failed" - CANCELLED = "cancelled" - - -register_model_labels( - "TaskStatus", - {"en": "Task Status", "fr": "Statut de la tâche"}, - { - 
"PENDING": {"en": "Pending", "fr": "En attente"}, - "RUNNING": {"en": "Running", "fr": "En cours"}, - "COMPLETED": {"en": "Completed", "fr": "Terminé"}, - "FAILED": {"en": "Failed", "fr": "Échec"}, - "CANCELLED": {"en": "Cancelled", "fr": "Annulé"}, - }, -) - - -class DocumentExchange(BaseModel, ModelMixin): - documentsLabel: str = Field(description="Label for the set of documents") - documents: List[str] = Field( - default_factory=list, description="List of document references" - ) - - -register_model_labels( - "DocumentExchange", - {"en": "Document Exchange", "fr": "Échange de documents"}, - { - "documentsLabel": {"en": "Documents Label", "fr": "Label des documents"}, - "documents": {"en": "Documents", "fr": "Documents"}, - }, -) - - -class TaskAction(BaseModel, ModelMixin): - id: str = Field(..., description="Action ID") - execMethod: str = Field(..., description="Method to execute") - execAction: str = Field(..., description="Action to perform") - execParameters: Dict[str, Any] = Field( - default_factory=dict, description="Action parameters" - ) - execResultLabel: Optional[str] = Field( - None, description="Label for the set of result documents" - ) - expectedDocumentFormats: Optional[List[Dict[str, str]]] = Field( - None, description="Expected document formats (optional)" - ) - userMessage: Optional[str] = Field( - None, description="User-friendly message in user's language" - ) - status: TaskStatus = Field(default=TaskStatus.PENDING, description="Action status") - error: Optional[str] = Field(None, description="Error message if action failed") - retryCount: int = Field(default=0, description="Number of retries attempted") - retryMax: int = Field(default=3, description="Maximum number of retries") - processingTime: Optional[float] = Field( - None, description="Processing time in seconds" - ) - timestamp: float = Field( - ..., description="When the action was executed (UTC timestamp in seconds)" - ) - result: Optional[str] = Field(None, description="Result of 
the action") - - -register_model_labels( - "TaskAction", - {"en": "Task Action", "fr": "Action de tâche"}, - { - "id": {"en": "Action ID", "fr": "ID de l'action"}, - "execMethod": {"en": "Method", "fr": "Méthode"}, - "execAction": {"en": "Action", "fr": "Action"}, - "execParameters": {"en": "Parameters", "fr": "Paramètres"}, - "execResultLabel": {"en": "Result Label", "fr": "Label du résultat"}, - "expectedDocumentFormats": { - "en": "Expected Document Formats", - "fr": "Formats de documents attendus", - }, - "userMessage": {"en": "User Message", "fr": "Message utilisateur"}, - "status": {"en": "Status", "fr": "Statut"}, - "error": {"en": "Error", "fr": "Erreur"}, - "retryCount": {"en": "Retry Count", "fr": "Nombre de tentatives"}, - "retryMax": {"en": "Max Retries", "fr": "Tentatives max"}, - "processingTime": {"en": "Processing Time", "fr": "Temps de traitement"}, - "timestamp": {"en": "Timestamp", "fr": "Horodatage"}, - "result": {"en": "Result", "fr": "Résultat"}, - }, -) - - -class TaskResult(BaseModel, ModelMixin): - taskId: str = Field(..., description="Task ID") - status: TaskStatus = Field(default=TaskStatus.PENDING, description="Task status") - success: bool = Field(..., description="Whether the task was successful") - feedback: Optional[str] = Field(None, description="Task feedback message") - error: Optional[str] = Field(None, description="Error message if task failed") - - -register_model_labels( - "TaskResult", - {"en": "Task Result", "fr": "Résultat de tâche"}, - { - "taskId": {"en": "Task ID", "fr": "ID de la tâche"}, - "status": {"en": "Status", "fr": "Statut"}, - "success": {"en": "Success", "fr": "Succès"}, - "feedback": {"en": "Feedback", "fr": "Retour"}, - "error": {"en": "Error", "fr": "Erreur"}, - }, -) - - -class TaskItem(BaseModel, ModelMixin): - id: str = Field(..., description="Task ID") - workflowId: str = Field(..., description="Workflow ID") - userInput: str = Field(..., description="User input that triggered the task") - status: 
TaskStatus = Field(default=TaskStatus.PENDING, description="Task status") - error: Optional[str] = Field(None, description="Error message if task failed") - startedAt: Optional[float] = Field( - None, description="When the task started (UTC timestamp in seconds)" - ) - finishedAt: Optional[float] = Field( - None, description="When the task finished (UTC timestamp in seconds)" - ) - actionList: List[TaskAction] = Field( - default_factory=list, description="List of actions to execute" - ) - retryCount: int = Field(default=0, description="Number of retries attempted") - retryMax: int = Field(default=3, description="Maximum number of retries") - rollbackOnFailure: bool = Field( - default=True, description="Whether to rollback on failure" - ) - dependencies: List[str] = Field( - default_factory=list, description="List of task IDs this task depends on" - ) - feedback: Optional[str] = Field(None, description="Task feedback message") - processingTime: Optional[float] = Field( - None, description="Total processing time in seconds" - ) - resultLabels: Optional[Dict[str, Any]] = Field( - default_factory=dict, description="Map of result labels to their values" - ) - - -register_model_labels( - "TaskItem", - {"en": "Task", "fr": "Tâche"}, - { - "id": {"en": "Task ID", "fr": "ID de la tâche"}, - "workflowId": {"en": "Workflow ID", "fr": "ID du workflow"}, - "userInput": {"en": "User Input", "fr": "Entrée utilisateur"}, - "status": {"en": "Status", "fr": "Statut"}, - "error": {"en": "Error", "fr": "Erreur"}, - "startedAt": {"en": "Started At", "fr": "Démarré à"}, - "finishedAt": {"en": "Finished At", "fr": "Terminé à"}, - "actionList": {"en": "Actions", "fr": "Actions"}, - "retryCount": {"en": "Retry Count", "fr": "Nombre de tentatives"}, - "retryMax": {"en": "Max Retries", "fr": "Tentatives max"}, - "processingTime": {"en": "Processing Time", "fr": "Temps de traitement"}, - }, -) - - -class TaskStep(BaseModel, ModelMixin): - id: str - objective: str - dependencies: 
Optional[list[str]] = Field(default_factory=list) - success_criteria: Optional[list[str]] = Field(default_factory=list) - estimated_complexity: Optional[str] = None - userMessage: Optional[str] = Field( - None, description="User-friendly message in user's language" - ) - - -register_model_labels( - "TaskStep", - {"en": "Task Step", "fr": "Étape de tâche"}, - { - "id": {"en": "ID", "fr": "ID"}, - "objective": {"en": "Objective", "fr": "Objectif"}, - "dependencies": {"en": "Dependencies", "fr": "Dépendances"}, - "success_criteria": {"en": "Success Criteria", "fr": "Critères de succès"}, - "estimated_complexity": { - "en": "Estimated Complexity", - "fr": "Complexité estimée", - }, - "userMessage": {"en": "User Message", "fr": "Message utilisateur"}, - }, -) - - -class TaskHandover(BaseModel, ModelMixin): - taskId: str = Field(description="Target task ID") - sourceTask: Optional[str] = Field(None, description="Source task ID") - inputDocuments: List[DocumentExchange] = Field( - default_factory=list, description="Available input documents" - ) - outputDocuments: List[DocumentExchange] = Field( - default_factory=list, description="Produced output documents" - ) - context: Dict[str, Any] = Field(default_factory=dict, description="Task context") - previousResults: List[str] = Field( - default_factory=list, description="Previous result summaries" - ) - improvements: List[str] = Field( - default_factory=list, description="Improvement suggestions" - ) - workflowSummary: Optional[str] = Field( - None, description="Summarized workflow context" - ) - messageHistory: List[str] = Field( - default_factory=list, description="Key message summaries" - ) - timestamp: float = Field( - ..., description="When the handover was created (UTC timestamp in seconds)" - ) - handoverType: str = Field( - default="task", description="Type of handover: task, phase, or workflow" - ) - - -register_model_labels( - "TaskHandover", - {"en": "Task Handover", "fr": "Transfert de tâche"}, - { - "taskId": 
{"en": "Task ID", "fr": "ID de la tâche"}, - "sourceTask": {"en": "Source Task", "fr": "Tâche source"}, - "inputDocuments": {"en": "Input Documents", "fr": "Documents d'entrée"}, - "outputDocuments": {"en": "Output Documents", "fr": "Documents de sortie"}, - "context": {"en": "Context", "fr": "Contexte"}, - "previousResults": {"en": "Previous Results", "fr": "Résultats précédents"}, - "improvements": {"en": "Improvements", "fr": "Améliorations"}, - "workflowSummary": {"en": "Workflow Summary", "fr": "Résumé du workflow"}, - "messageHistory": {"en": "Message History", "fr": "Historique des messages"}, - "timestamp": {"en": "Timestamp", "fr": "Horodatage"}, - "handoverType": {"en": "Handover Type", "fr": "Type de transfert"}, - }, -) - - -class TaskContext(BaseModel, ModelMixin): - task_step: TaskStep - workflow: Optional["ChatWorkflow"] = None - workflow_id: Optional[str] = None - available_documents: Optional[str] = "No documents available" - available_connections: Optional[list[str]] = Field(default_factory=list) - previous_results: Optional[list[str]] = Field(default_factory=list) - previous_handover: Optional[TaskHandover] = None - improvements: Optional[list[str]] = Field(default_factory=list) - retry_count: Optional[int] = 0 - previous_action_results: Optional[list] = Field(default_factory=list) - previous_review_result: Optional[dict] = None - is_regeneration: Optional[bool] = False - failure_patterns: Optional[list[str]] = Field(default_factory=list) - failed_actions: Optional[list] = Field(default_factory=list) - successful_actions: Optional[list] = Field(default_factory=list) - criteria_progress: Optional[dict] = None - - def getDocumentReferences(self) -> List[str]: - docs = [] - if self.previous_handover: - for doc_exchange in self.previous_handover.inputDocuments: - docs.extend(doc_exchange.documents) - return list(set(docs)) - - def addImprovement(self, improvement: str) -> None: - if improvement not in (self.improvements or []): - if self.improvements 
is None: - self.improvements = [] - self.improvements.append(improvement) - - -class ReviewContext(BaseModel, ModelMixin): - task_step: TaskStep - task_actions: Optional[list] = Field(default_factory=list) - action_results: Optional[list] = Field(default_factory=list) - step_result: Optional[dict] = Field(default_factory=dict) - workflow_id: Optional[str] = None - previous_results: Optional[list[str]] = Field(default_factory=list) - - -class ReviewResult(BaseModel, ModelMixin): - status: str - reason: Optional[str] = None - improvements: Optional[list[str]] = Field(default_factory=list) - quality_score: Optional[int] = 5 - missing_outputs: Optional[list[str]] = Field(default_factory=list) - met_criteria: Optional[list[str]] = Field(default_factory=list) - unmet_criteria: Optional[list[str]] = Field(default_factory=list) - confidence: Optional[float] = 0.5 - userMessage: Optional[str] = Field( - None, description="User-friendly message in user's language" - ) - - -register_model_labels( - "ReviewResult", - {"en": "Review Result", "fr": "Résultat de l'évaluation"}, - { - "status": {"en": "Status", "fr": "Statut"}, - "reason": {"en": "Reason", "fr": "Raison"}, - "improvements": {"en": "Improvements", "fr": "Améliorations"}, - "quality_score": {"en": "Quality Score", "fr": "Score de qualité"}, - "missing_outputs": {"en": "Missing Outputs", "fr": "Sorties manquantes"}, - "met_criteria": {"en": "Met Criteria", "fr": "Critères respectés"}, - "unmet_criteria": {"en": "Unmet Criteria", "fr": "Critères non respectés"}, - "confidence": {"en": "Confidence", "fr": "Confiance"}, - "userMessage": {"en": "User Message", "fr": "Message utilisateur"}, - }, -) - - -class TaskPlan(BaseModel, ModelMixin): - overview: str - tasks: list[TaskStep] - userMessage: Optional[str] = Field( - None, description="Overall user-friendly message for the task plan" - ) - - -register_model_labels( - "TaskPlan", - {"en": "Task Plan", "fr": "Plan de tâches"}, - { - "overview": {"en": "Overview", "fr": 
"Aperçu"}, - "tasks": {"en": "Tasks", "fr": "Tâches"}, - "userMessage": {"en": "User Message", "fr": "Message utilisateur"}, - }, -) diff --git a/modules/interfaces/interfaceAiObjects.py b/modules/interfaces/interfaceAiObjects.py index 60f803cc..3e9c744d 100644 --- a/modules/interfaces/interfaceAiObjects.py +++ b/modules/interfaces/interfaceAiObjects.py @@ -1,10 +1,12 @@ import logging -from typing import Dict, Any, List, Union +from typing import Dict, Any, List, Union, Tuple, Optional from dataclasses import dataclass +logger = logging.getLogger(__name__) + from modules.connectors.connectorAiOpenai import AiOpenai from modules.connectors.connectorAiAnthropic import AiAnthropic -from modules.connectors.connectorAiLangdoc import AiLangdoc +from modules.connectors.connectorAiPerplexity import AiPerplexity from modules.connectors.connectorAiTavily import ConnectorWeb from modules.datamodels.datamodelAi import ( AiCallOptions, @@ -18,26 +20,14 @@ from modules.datamodels.datamodelAi import ( PROCESSING_MODE_PRIORITY_MAPPING ) from modules.datamodels.datamodelWeb import ( - WebCrawlActionResult, - WebCrawlActionDocument, - WebCrawlDocumentData, - WebCrawlRequest, - WebCrawlResultItem, - WebScrapeActionResult, - WebScrapeActionDocument, - WebSearchDocumentData as WebScrapeDocumentData, - WebScrapeRequest, - WebScrapeResultItem, - WebSearchActionResult, - WebSearchActionDocument, - WebSearchDocumentData, - WebSearchRequest, + WebResearchRequest, + WebResearchActionResult, WebSearchResultItem, + WebCrawlResultItem, + WebSearchRequest, + WebCrawlRequest, ) -from modules.datamodels.datamodelWorkflow import ActionDocument - - -logger = logging.getLogger(__name__) +from modules.datamodels.datamodelChat import ActionDocument # Comprehensive model registry with capability tags and function mapping @@ -52,8 +42,8 @@ aiModels: Dict[str, Dict[str, Any]] = { "costPer1kTokensOutput": 0.06, "speedRating": 8, "qualityRating": 9, - "capabilities": ["text_generation", "chat", 
"reasoning"], - "tags": ["text", "chat", "reasoning", "general"] + "capabilities": ["text_generation", "chat", "reasoning", "analysis"], + "tags": ["text", "chat", "reasoning", "analysis", "general"] }, "openai_callAiBasic_gpt35": { "connector": "openai", @@ -118,90 +108,66 @@ aiModels: Dict[str, Dict[str, Any]] = { "tags": ["image", "vision", "multimodal", "high_quality"] }, - # LangDoc Models - "langdoc_callAiBasic": { - "connector": "langdoc", + # Perplexity Models + "perplexity_callAiBasic": { + "connector": "perplexity", "function": "callAiBasic", - "llmName": "gpt-4o", + "llmName": "llama-3.1-sonar-large-128k-online", "contextLength": 128000, - "costPer1kTokens": 0.02, - "costPer1kTokensOutput": 0.04, + "costPer1kTokens": 0.005, + "costPer1kTokensOutput": 0.005, "speedRating": 8, - "qualityRating": 9, - "capabilities": ["text_generation", "chat", "reasoning"], - "tags": ["text", "chat", "reasoning", "general", "cost_effective"] + "qualityRating": 8, + "capabilities": ["text_generation", "chat", "reasoning", "web_search"], + "tags": ["text", "chat", "reasoning", "web_search", "cost_effective"] }, - "langdoc_callAiImage": { - "connector": "langdoc", - "function": "callAiImage", - "llmName": "gpt-4o", + "perplexity_callAiWithWebSearch": { + "connector": "perplexity", + "function": "callAiWithWebSearch", + "llmName": "sonar-pro", "contextLength": 128000, - "costPer1kTokens": 0.02, - "costPer1kTokensOutput": 0.04, + "costPer1kTokens": 0.01, + "costPer1kTokensOutput": 0.01, "speedRating": 7, "qualityRating": 9, - "capabilities": ["image_analysis", "vision", "multimodal"], - "tags": ["image", "vision", "multimodal", "cost_effective"] + "capabilities": ["text_generation", "web_search", "research"], + "tags": ["text", "web_search", "research", "high_quality"] }, - "langdoc_generateImage": { - "connector": "langdoc", - "function": "generateImage", - "llmName": "dall-e-3", - "contextLength": 0, - "costPer1kTokens": 0.04, - "costPer1kTokensOutput": 0.0, - "speedRating": 
6, - "qualityRating": 9, - "capabilities": ["image_generation", "art", "visual_creation"], - "tags": ["image_generation", "art", "visual", "cost_effective"] - }, - "langdoc_generateImageWithVariations": { - "connector": "langdoc", - "function": "generateImageWithVariations", - "llmName": "dall-e-3", - "contextLength": 0, - "costPer1kTokens": 0.04, - "costPer1kTokensOutput": 0.0, - "speedRating": 5, - "qualityRating": 9, - "capabilities": ["image_generation", "art", "visual_creation", "variations"], - "tags": ["image_generation", "art", "visual", "variations", "cost_effective"] - }, - "langdoc_generateImageWithChat": { - "connector": "langdoc", - "function": "generateImageWithChat", - "llmName": "gpt-4o", - "contextLength": 128000, - "costPer1kTokens": 0.02, - "costPer1kTokensOutput": 0.04, - "speedRating": 6, + "perplexity_researchTopic": { + "connector": "perplexity", + "function": "researchTopic", + "llmName": "mistral-7b-instruct", + "contextLength": 32000, + "costPer1kTokens": 0.002, + "costPer1kTokensOutput": 0.002, + "speedRating": 8, "qualityRating": 8, - "capabilities": ["image_generation", "chat", "visual_creation"], - "tags": ["image_generation", "chat", "visual", "cost_effective"] + "capabilities": ["web_search", "research", "information_gathering"], + "tags": ["web_search", "research", "information", "cost_effective"] }, - "langdoc_listModels": { - "connector": "langdoc", - "function": "listModels", - "llmName": "api", - "contextLength": 0, - "costPer1kTokens": 0.0, - "costPer1kTokensOutput": 0.0, - "speedRating": 9, - "qualityRating": 5, - "capabilities": ["model_listing", "api_info"], - "tags": ["api", "info", "models"] + "perplexity_answerQuestion": { + "connector": "perplexity", + "function": "answerQuestion", + "llmName": "mistral-7b-instruct", + "contextLength": 32000, + "costPer1kTokens": 0.002, + "costPer1kTokensOutput": 0.002, + "speedRating": 8, + "qualityRating": 8, + "capabilities": ["web_search", "question_answering", "research"], + "tags": 
["web_search", "qa", "research", "cost_effective"] }, - "langdoc_getModelInfo": { - "connector": "langdoc", - "function": "getModelInfo", - "llmName": "api", - "contextLength": 0, - "costPer1kTokens": 0.0, - "costPer1kTokensOutput": 0.0, - "speedRating": 9, - "qualityRating": 5, - "capabilities": ["model_info", "api_info"], - "tags": ["api", "info", "models"] + "perplexity_getCurrentNews": { + "connector": "perplexity", + "function": "getCurrentNews", + "llmName": "mistral-7b-instruct", + "contextLength": 32000, + "costPer1kTokens": 0.002, + "costPer1kTokensOutput": 0.002, + "speedRating": 8, + "qualityRating": 8, + "capabilities": ["web_search", "news", "current_events"], + "tags": ["web_search", "news", "current_events", "cost_effective"] }, # Tavily Web Models @@ -250,7 +216,7 @@ class AiObjects: openaiService: AiOpenai anthropicService: AiAnthropic - langdocService: AiLangdoc + perplexityService: AiPerplexity tavilyService: ConnectorWeb def __post_init__(self) -> None: @@ -258,8 +224,8 @@ class AiObjects: raise TypeError("openaiService must be provided") if self.anthropicService is None: raise TypeError("anthropicService must be provided") - if self.langdocService is None: - raise TypeError("langdocService must be provided") + if self.perplexityService is None: + raise TypeError("perplexityService must be provided") if self.tavilyService is None: raise TypeError("tavilyService must be provided") @@ -268,13 +234,13 @@ class AiObjects: """Create AiObjects instance with all connectors initialized.""" openaiService = AiOpenai() anthropicService = AiAnthropic() - langdocService = AiLangdoc() + perplexityService = AiPerplexity() tavilyService = await ConnectorWeb.create() return cls( openaiService=openaiService, anthropicService=anthropicService, - langdocService=langdocService, + perplexityService=perplexityService, tavilyService=tavilyService ) @@ -330,11 +296,22 @@ class AiObjects: elif options.operationType == OperationType.IMAGE_GENERATION: return 
"openai_generateImage" elif options.operationType == OperationType.WEB_RESEARCH: - return "langdoc_callAiBasic" + return "perplexity_callAiWithWebSearch" else: return "openai_callAiBasic_gpt35" - # Select based on priority + # Special handling for planning operations - use Claude for consistency + if options.operationType in [OperationType.GENERATE_PLAN, OperationType.ANALYSE_CONTENT]: + if "anthropic_callAiBasic" in candidates: + logger.info("Planning operation: Selected Claude (anthropic_callAiBasic) for highest quality") + return "anthropic_callAiBasic" + + # Fallback to GPT-4o if Claude not available + if "openai_callAiBasic" in candidates: + logger.info("Planning operation: Selected GPT-4o (openai_callAiBasic) as fallback") + return "openai_callAiBasic" + + # Select based on priority for other operations if effectivePriority == Priority.SPEED: return max(candidates, key=lambda k: candidates[k]["speedRating"]) elif effectivePriority == Priority.QUALITY: @@ -355,8 +332,8 @@ class AiObjects: return self.openaiService elif connectorType == "anthropic": return self.anthropicService - elif connectorType == "langdoc": - return self.langdocService + elif connectorType == "perplexity": + return self.perplexityService elif connectorType == "tavily": return self.tavilyService else: @@ -383,6 +360,17 @@ class AiObjects: # Select model for text generation modelName = self._selectModel(prompt, context, options) + # Derive generation parameters + temperature = getattr(options, "temperature", None) + if temperature is None: + temperature = 0.2 + maxTokens = getattr(options, "maxTokens", None) + # Provide a generous default to avoid truncation for long outputs + if maxTokens is None: + # If resultFormat suggests large outputs (e.g., html, json), allow more tokens + wants_large = str(getattr(options, "resultFormat", "")).lower() in ["html", "json", "md", "markdown"] + maxTokens = 8000 if wants_large else 2000 + messages: List[Dict[str, Any]] = [] if context: 
logger = logging.getLogger(__name__)  # module-level logger, as defined at the top of this module

# Web functionality methods - Simple interface to Tavily connector
async def search_websites(self, query: str, max_results: int = 5, **kwargs) -> List[WebSearchResultItem]:
    """Search for websites using Tavily.

    Args:
        query: Free-text search query.
        max_results: Maximum number of results to request.
        **kwargs: Passed through to WebSearchRequest (extra search options).

    Returns:
        The result items of the first returned document, or [] on failure / no results.
    """
    request = WebSearchRequest(query=query, max_results=max_results, **kwargs)
    result = await self.tavilyService.search(request)
    if result.success and result.documents:
        return result.documents[0].documentData.results
    return []


async def crawl_websites(self, urls: List[str], extract_depth: str = "advanced", format: str = "markdown") -> List[WebCrawlResultItem]:
    """Crawl websites using Tavily.

    Invalid URLs are logged and skipped; URLs without a scheme get "https://" prepended.
    NOTE: `format` intentionally shadows the builtin to keep the public keyword name stable.
    """
    from pydantic import HttpUrl
    from urllib.parse import urlparse

    http_urls = []
    for url in urls:
        try:
            parsed = urlparse(url)
            if not parsed.scheme:
                url = f"https://{url}"

            # BUG FIX: HttpUrl takes only the URL string. The previous call
            # HttpUrl(url, scheme="https") raised for every URL, so every URL
            # was "skipped as invalid" and this method always returned [].
            http_urls.append(HttpUrl(url))

        except Exception as e:
            logger.warning(f"Skipping invalid URL {url}: {e}")
            continue

    if not http_urls:
        return []

    request = WebCrawlRequest(
        urls=http_urls,
        extract_depth=extract_depth,
        format=format
    )
    result = await self.tavilyService.crawl(request)

    if result.success and result.documents:
        return result.documents[0].documentData.results
    return []


async def extract_content(self, urls: List[str], extract_depth: str = "advanced", format: str = "markdown") -> Dict[str, str]:
    """Extract content from URLs and return a {url: content} dictionary."""
    crawl_results = await self.crawl_websites(urls, extract_depth, format)
    return {str(result.url): result.content for result in crawl_results}


# Core Web Tools - Clean interface for web operations
async def readPage(self, url: str, extract_depth: str = "advanced") -> Optional[str]:
    """Read a single web page and return its content (Markdown), or None on failure."""
    logger.debug(f"Reading page: {url}")
    try:
        from urllib.parse import quote, urlparse, urlunparse

        # Percent-encode only the query string (spaces etc.); '=' and '&' keep
        # their separator meaning. The previous no-op parse/unparse round-trip
        # of the whole URL was removed (it changed nothing).
        encoded_url = url
        parsed = urlparse(url)
        if parsed.query:
            encoded_query = quote(parsed.query, safe='=&')
            encoded_url = urlunparse((
                parsed.scheme,
                parsed.netloc,
                parsed.path,
                parsed.params,
                encoded_query,
                parsed.fragment
            ))
            logger.debug(f"URL encoded: {url} -> {encoded_url}")

        content = await self.extract_content([encoded_url], extract_depth, "markdown")
        result = content.get(encoded_url)
        if result:
            logger.debug(f"Successfully read page {encoded_url}: {len(result)} chars")
        else:
            logger.warning(f"No content returned for page {encoded_url}")
        return result
    except Exception as e:
        logger.warning(f"Failed to read page {url}: {e}")
        return None


async def getUrlsFromPage(self, url: str, extract_depth: str = "advanced") -> List[str]:
    """Get all URLs from a web page, with redundancies removed (first occurrence wins)."""
    try:
        content = await self.readPage(url, extract_depth)
        if not content:
            return []

        links = self._extractLinksFromContent(content, url)
        # dict preserves insertion order, so this dedupes while keeping order
        unique_links = list(dict.fromkeys(links))

        logger.debug(f"Extracted {len(unique_links)} unique URLs from {url}")
        return unique_links

    except Exception as e:
        logger.warning(f"Failed to get URLs from page {url}: {e}")
        return []


def filterUrlsOnlyPages(self, urls: List[str], max_per_domain: int = 10) -> List[str]:
    """Filter URLs to keep only followable HTML pages (no images, media, archives, assets).

    Deduplicates per domain and caps each domain at max_per_domain links.
    """
    from urllib.parse import urlparse

    def _isHtmlCandidate(url: str) -> bool:
        # Reject by extension: images, media, documents/archives, static assets.
        lower = url.lower()
        blocked = ('.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp',
                   '.mp4', '.mp3', '.avi', '.mov', '.mkv',
                   '.pdf', '.zip', '.rar', '.7z', '.tar', '.gz',
                   '.css', '.js', '.woff', '.woff2', '.ttf', '.eot')
        return not lower.endswith(blocked)

    # Group by domain so the cap applies per site
    domain_links: Dict[str, List[str]] = {}
    for link in urls:
        domain_links.setdefault(urlparse(link).netloc, []).append(link)

    # Filter and cap per domain
    filtered_links: List[str] = []
    for domain, domain_link_list in domain_links.items():
        seen = set()
        domain_filtered: List[str] = []

        for link in domain_link_list:
            if link in seen or not _isHtmlCandidate(link):
                continue
            seen.add(link)
            domain_filtered.append(link)
            if len(domain_filtered) >= max_per_domain:
                break

        filtered_links.extend(domain_filtered)
        logger.debug(f"Domain {domain}: {len(domain_link_list)} -> {len(domain_filtered)} links")

    return filtered_links
etc.).""" + from urllib.parse import urlparse + + def _isHtmlCandidate(url: str) -> bool: + lower = url.lower() + blocked = ('.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp', + '.mp4', '.mp3', '.avi', '.mov', '.mkv', + '.pdf', '.zip', '.rar', '.7z', '.tar', '.gz', + '.css', '.js', '.woff', '.woff2', '.ttf', '.eot') + return not lower.endswith(blocked) + + # Group by domain + domain_links = {} + for link in urls: + domain = urlparse(link).netloc + if domain not in domain_links: + domain_links[domain] = [] + domain_links[domain].append(link) + + # Filter and cap per domain + filtered_links = [] + for domain, domain_link_list in domain_links.items(): + seen = set() + domain_filtered = [] + + for link in domain_link_list: + if link in seen: + continue + if not _isHtmlCandidate(link): + continue + seen.add(link) + domain_filtered.append(link) + if len(domain_filtered) >= max_per_domain: + break + + filtered_links.extend(domain_filtered) + logger.debug(f"Domain {domain}: {len(domain_link_list)} -> {len(domain_filtered)} links") + + return filtered_links + + def _extractLinksFromContent(self, content: str, base_url: str) -> List[str]: + """Extract links from HTML/Markdown content.""" + try: + import re + from urllib.parse import urljoin, urlparse, quote, urlunparse + + def _cleanUrl(url: str) -> str: + """Clean and encode URL to remove spaces and invalid characters.""" + # Remove quotes and extra spaces + url = url.strip().strip('"\'') + + # If it's a relative URL, make it absolute first + if not url.startswith(('http://', 'https://')): + url = urljoin(base_url, url) + + # Parse and re-encode the URL properly + parsed = urlparse(url) + if parsed.query: + # Encode query parameters properly + encoded_query = quote(parsed.query, safe='=&') + url = urlunparse(( + parsed.scheme, + parsed.netloc, + parsed.path, + parsed.params, + encoded_query, + parsed.fragment + )) + + return url + + links = [] + + # Extract HTML links: format + html_link_pattern = 
r']+href=["\']([^"\']+)["\'][^>]*>' + html_links = re.findall(html_link_pattern, content, re.IGNORECASE) + + for url in html_links: + if url and not url.startswith('#') and not url.startswith('javascript:'): + try: + cleaned_url = _cleanUrl(url) + links.append(cleaned_url) + logger.debug(f"Extracted HTML link: {url} -> {cleaned_url}") + except Exception as e: + logger.debug(f"Failed to clean HTML link {url}: {e}") + + # Extract markdown links: [text](url) format + markdown_link_pattern = r'\[([^\]]+)\]\(([^)]+)\)' + markdown_links = re.findall(markdown_link_pattern, content) + + for text, url in markdown_links: + if url and not url.startswith('#'): + try: + cleaned_url = _cleanUrl(url) + # Only keep URLs from the same domain + if urlparse(cleaned_url).netloc == urlparse(base_url).netloc: + links.append(cleaned_url) + logger.debug(f"Extracted markdown link: {url} -> {cleaned_url}") + except Exception as e: + logger.debug(f"Failed to clean markdown link {url}: {e}") + + # Extract plain URLs in the text + url_pattern = r'https?://[^\s\)]+' + plain_urls = re.findall(url_pattern, content) + + for url in plain_urls: + try: + clean_url = url.rstrip('.,;!?') + cleaned_url = _cleanUrl(clean_url) + if urlparse(cleaned_url).netloc == urlparse(base_url).netloc: + if cleaned_url not in links: # Avoid duplicates + links.append(cleaned_url) + logger.debug(f"Extracted plain URL: {url} -> {cleaned_url}") + except Exception as e: + logger.debug(f"Failed to clean plain URL {url}: {e}") + + logger.debug(f"Total links extracted and cleaned: {len(links)}") + return links + + except Exception as e: + logger.warning(f"Failed to extract links from content: {e}") + return [] + + async def crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10) -> Dict[str, str]: + """ + Recursively crawl URLs up to specified depth. 
+ + Args: + urls: List of starting URLs to crawl + max_depth: Maximum depth to crawl (1=main pages only, 2=main+sub-pages, etc.) + extract_depth: Tavily extract depth setting + max_per_domain: Maximum URLs per domain per level + + Returns: + Dictionary mapping URL -> content for all crawled pages + """ + logger.info(f"Starting recursive crawl: {len(urls)} starting URLs, max_depth={max_depth}") + + # URL index to track all processed URLs + processed_urls = set() + all_content = {} + + # Current level URLs to process + current_level_urls = urls.copy() + + for depth in range(1, max_depth + 1): + logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===") + logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}") + + # URLs found at this level (for next iteration) + next_level_urls = [] + + for url in current_level_urls: + if url in processed_urls: + logger.debug(f"URL {url} already processed, skipping") + continue + + try: + logger.info(f"Processing URL at depth {depth}: {url}") + + # Read page content + content = await self.readPage(url, extract_depth) + if content: + all_content[url] = content + processed_urls.add(url) + logger.info(f"✓ Successfully processed {url}: {len(content)} chars") + + # Get URLs from this page for next level + page_urls = await self.getUrlsFromPage(url, extract_depth) + logger.info(f"Found {len(page_urls)} URLs on {url}") + + # Filter URLs and add to next level + filtered_urls = self.filterUrlsOnlyPages(page_urls, max_per_domain) + logger.info(f"Filtered to {len(filtered_urls)} valid URLs") + + # Add new URLs to next level (avoiding already processed ones) + new_urls_count = 0 + for new_url in filtered_urls: + if new_url not in processed_urls: + next_level_urls.append(new_url) + new_urls_count += 1 + + logger.info(f"Added {new_urls_count} new URLs to next level from {url}") + else: + logger.warning(f"✗ No content extracted from {url}") + processed_urls.add(url) # Mark as processed to avoid retry + + except Exception as e: + 
logger.warning(f"✗ Failed to process URL {url} at depth {depth}: {e}") + processed_urls.add(url) # Mark as processed to avoid retry + + # Prepare for next iteration + current_level_urls = next_level_urls + logger.info(f"Depth {depth} completed. Found {len(next_level_urls)} URLs for next level") + + # Stop if no more URLs to process + if not current_level_urls: + logger.info(f"No more URLs found at depth {depth}, stopping recursion") + break + + logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled") + return all_content async def webQuery(self, query: str, context: str = "", options: AiCallOptions = None) -> str: - """Use LangDoc AI to provide the best answers for web-related queries.""" + """Use Perplexity AI to provide the best answers for web-related queries.""" if options is None: options = AiCallOptions(operationType=OperationType.WEB_RESEARCH) @@ -480,14 +795,12 @@ Please provide: Format your response in a clear, professional manner that would be helpful for someone researching this topic.""" - messages = [{"role": "user", "content": webPrompt}] - try: - # Use LangDoc for the best answers - response = await self.langdocService.callAiBasic(messages) + # Use Perplexity for web research with search capabilities + response = await self.perplexityService.callAiWithWebSearch(webPrompt) return response except Exception as e: - logger.error(f"LangDoc web query failed: {str(e)}") + logger.error(f"Perplexity web query failed: {str(e)}") raise Exception(f"Failed to process web query: {str(e)}") # Utility methods @@ -511,3 +824,157 @@ Format your response in a clear, professional manner that would be helpful for s """Get model names that have a specific tag.""" return [name for name, info in aiModels.items() if tag in info.get("tags", [])] + async def selectRelevantWebsites(self, websites: List[str], userQuestion: str) -> Tuple[List[str], str]: + """Select most relevant websites using AI analysis. 
Returns (selected_websites, ai_response).""" + if len(websites) <= 1: + return websites, "Only one website available, no selection needed" + + try: + # Create website summaries for AI analysis + websiteSummaries = [] + for i, url in enumerate(websites, 1): + from urllib.parse import urlparse + domain = urlparse(url).netloc + summary = f"{i}. {url} (Domain: {domain})" + websiteSummaries.append(summary) + + selectionPrompt = f""" + Based on this user request: "{userQuestion}" + + I have {len(websites)} websites found. Please select the most relevant website(s) for this request. + + Available websites: + {chr(10).join(websiteSummaries)} + + Please respond with the website number(s) (1, 2, 3, etc.) that are most relevant. + Format: 1,3,5 (or just 1 for single selection) + """ + + # Use Perplexity to select the best websites + response = await self.webQuery(selectionPrompt) + + # Parse the selection + import re + numbers = re.findall(r'\d+', response) + if numbers: + selectedWebsites = [] + for num in numbers: + index = int(num) - 1 + if 0 <= index < len(websites): + selectedWebsites.append(websites[index]) + + if selectedWebsites: + logger.info(f"AI selected {len(selectedWebsites)} websites") + return selectedWebsites, response + + # Fallback to first website + logger.warning("AI selection failed, using first website") + return websites[:1], f"AI selection failed, fallback to first website. 
AI response: {response}" + + except Exception as e: + logger.error(f"Error in website selection: {str(e)}") + return websites[:1], f"Error in website selection: {str(e)}" + + async def analyzeContentWithChunking(self, allContent: Dict[str, str], userQuestion: str) -> str: + """Analyze content using AI with chunking for large content.""" + logger.info(f"Analyzing {len(allContent)} websites with AI") + + # Process content in chunks to avoid token limits + chunkSize = 50000 # 50k chars per chunk + allChunks = [] + + for url, content in allContent.items(): + filteredContent = self._filterContent(content) + if len(filteredContent) <= chunkSize: + allChunks.append((url, filteredContent)) + logger.info(f"Content from {url}: {len(filteredContent)} chars (single chunk)") + else: + # Split large content into chunks + chunkCount = (len(filteredContent) + chunkSize - 1) // chunkSize + logger.info(f"Content from {url}: {len(filteredContent)} chars (split into {chunkCount} chunks)") + for i in range(0, len(filteredContent), chunkSize): + chunk = filteredContent[i:i+chunkSize] + chunkNum = i//chunkSize + 1 + allChunks.append((f"{url} (part {chunkNum})", chunk)) + + logger.info(f"Processing {len(allChunks)} content chunks") + + # Analyze each chunk + chunkAnalyses = [] + for i, (url, chunk) in enumerate(allChunks, 1): + logger.info(f"Analyzing chunk {i}/{len(allChunks)}: {url}") + + try: + analysisPrompt = f""" + Analyze this web content and extract relevant information for: {userQuestion} + + Source: {url} + Content: {chunk} + + Please extract key information relevant to the query. 
+ """ + + analysis = await self.webQuery(analysisPrompt) + chunkAnalyses.append(analysis) + logger.info(f"Chunk {i}/{len(allChunks)} analyzed successfully") + + except Exception as e: + logger.error(f"Chunk {i}/{len(allChunks)} error: {e}") + + # Combine all chunk analyses + if chunkAnalyses: + logger.info(f"Combining {len(chunkAnalyses)} chunk analyses") + combinedAnalysis = "\n\n".join(chunkAnalyses) + + # Final synthesis + try: + logger.info("Performing final synthesis of all analyses") + synthesisPrompt = f""" + Based on these partial analyses, provide a comprehensive answer to: {userQuestion} + + Partial analyses: + {combinedAnalysis} + + Please provide a clear, well-structured answer to the query. + """ + + finalAnalysis = await self.webQuery(synthesisPrompt) + logger.info("Final synthesis completed successfully") + return finalAnalysis + + except Exception as e: + logger.error(f"Synthesis error: {e}") + return combinedAnalysis + else: + logger.error("No content could be analyzed") + return "No content could be analyzed" + + def _filterContent(self, content: str) -> str: + """Filter out navigation, ads, and other nonsense content.""" + lines = content.split('\n') + filteredLines = [] + + for line in lines: + line = line.strip() + # Skip empty lines + if not line: + continue + # Skip navigation elements + if any(skip in line.lower() for skip in [ + 'toggle navigation', 'log in', 'sign up', 'cookies', 'privacy policy', + 'terms of service', 'subscribe', 'newsletter', 'follow us', 'share this', + 'advertisement', 'sponsored', 'banner', 'popup', 'modal' + ]): + continue + # Skip image references without context + if line.startswith('![Image') and '](' in line: + continue + # Skip pure links without context + if line.startswith('[') and line.endswith(')') and '---' in line: + continue + # Keep meaningful content + if len(line) > 10: # Skip very short lines + filteredLines.append(line) + + return '\n'.join(filteredLines) + diff --git 
a/modules/interfaces/interfaceDbChatObjects.py b/modules/interfaces/interfaceDbChatObjects.py index 699f6bca..319c1703 100644 --- a/modules/interfaces/interfaceDbChatObjects.py +++ b/modules/interfaces/interfaceDbChatObjects.py @@ -12,8 +12,8 @@ from typing import Dict, Any, List, Optional, Union, get_origin, get_args import asyncio from modules.interfaces.interfaceDbChatAccess import ChatAccess -from modules.datamodels.datamodelWorkflow import ( - TaskAction, +from modules.datamodels.datamodelChat import ( + ActionItem, TaskResult, TaskItem, TaskStatus, @@ -549,7 +549,7 @@ class ChatObjects: created_documents.append(created_doc) # Convert to ChatMessage model - return ChatMessage( + chat_message = ChatMessage( id=createdMessage["id"], workflowId=createdMessage["workflowId"], parentMessageId=createdMessage.get("parentMessageId"), @@ -570,6 +570,11 @@ class ChatObjects: actionMethod=createdMessage.get("actionMethod"), actionName=createdMessage.get("actionName") ) + + # Debug: Store message and documents for debugging TODO REMOVE + self._storeDebugMessageAndDocuments(chat_message) + + return chat_message except Exception as e: logger.error(f"Error creating workflow message: {str(e)}") @@ -1045,6 +1050,120 @@ class ChatObjects: return {"items": items} + def _storeDebugMessageAndDocuments(self, message: ChatMessage) -> None: + """ + Store message and documents for debugging purposes in fileshare. 
+ Structure: gateway/test-chat/messages/m_round_task_action_timestamp/documentlist_label/documents + + Args: + message: ChatMessage object to store + """ + try: + import os + import json + from datetime import datetime, UTC + + # Create base debug directory + debug_root = "./test-chat/messages" + os.makedirs(debug_root, exist_ok=True) + + # Generate timestamp + timestamp = datetime.now(UTC).strftime('%Y%m%d-%H%M%S-%f')[:-3] + + # Create message folder name: m_round_task_action_timestamp + # Use actual values from message, not defaults + round_str = str(message.roundNumber) if message.roundNumber is not None else "0" + task_str = str(message.taskNumber) if message.taskNumber is not None else "0" + action_str = str(message.actionNumber) if message.actionNumber is not None else "0" + message_folder = f"{timestamp}_m_{round_str}_{task_str}_{action_str}" + + message_path = os.path.join(debug_root, message_folder) + os.makedirs(message_path, exist_ok=True) + + # Store message data - use dict() instead of model_dump() for compatibility + message_file = os.path.join(message_path, "message.json") + with open(message_file, "w", encoding="utf-8") as f: + # Convert message to dict manually to avoid model_dump() issues + message_dict = { + "id": message.id, + "workflowId": message.workflowId, + "parentMessageId": message.parentMessageId, + "message": message.message, + "role": message.role, + "status": message.status, + "sequenceNr": message.sequenceNr, + "publishedAt": message.publishedAt, + "roundNumber": message.roundNumber, + "taskNumber": message.taskNumber, + "actionNumber": message.actionNumber, + "documentsLabel": message.documentsLabel, + "actionId": message.actionId, + "actionMethod": message.actionMethod, + "actionName": message.actionName, + "success": message.success, + "documents": [] + } + json.dump(message_dict, f, indent=2, ensure_ascii=False, default=str) + + # Store message content as text + if message.message: + message_text_file = os.path.join(message_path, 
"message_text.txt") + with open(message_text_file, "w", encoding="utf-8") as f: + f.write(str(message.message)) + + # Store documents if provided + if message.documents and len(message.documents) > 0: + logger.info(f"Debug: Processing {len(message.documents)} documents") + + # Group documents by documentsLabel + documents_by_label = {} + for doc in message.documents: + label = message.documentsLabel or 'default' + if label not in documents_by_label: + documents_by_label[label] = [] + documents_by_label[label].append(doc) + + # Create subfolder for each document label + for label, docs in documents_by_label.items(): + # Sanitize label for filesystem + safe_label = "".join(c for c in str(label) if c.isalnum() or c in (' ', '-', '_')).rstrip() + safe_label = safe_label.replace(' ', '_') + if not safe_label: + safe_label = "default" + + label_folder = os.path.join(message_path, safe_label) + os.makedirs(label_folder, exist_ok=True) + logger.info(f"Debug: Created document folder: {label_folder}") + + # Store each document + for i, doc in enumerate(docs): + # Create document metadata file + doc_meta = { + "id": doc.id, + "messageId": doc.messageId, + "fileId": doc.fileId, + "fileName": doc.fileName, + "fileSize": doc.fileSize, + "mimeType": doc.mimeType, + "roundNumber": doc.roundNumber, + "taskNumber": doc.taskNumber, + "actionNumber": doc.actionNumber, + "actionId": doc.actionId + } + + doc_meta_file = os.path.join(label_folder, f"document_{i+1:03d}_metadata.json") + with open(doc_meta_file, "w", encoding="utf-8") as f: + json.dump(doc_meta, f, indent=2, ensure_ascii=False, default=str) + + logger.info(f"Debug: Stored document metadata for {doc.fileName}") + + logger.info(f"Debug: Stored message and documents in {message_path}") + + except Exception as e: + logger.error(f"Debug: Failed to store message and documents: {e}") + import traceback + logger.error(f"Debug: Traceback: {traceback.format_exc()}") + def getInterface(currentUser: Optional[User] = None) -> 
logger = logging.getLogger(__name__)  # module-level logger, as defined at the top of this module

@property
def extractionService(self):
    """Lazy initialization of extraction service."""
    # Fast path: already built
    if self._extractionService is not None:
        return self._extractionService
    logger.info("Lazy initializing ExtractionService...")
    self._extractionService = ExtractionService()
    return self._extractionService


async def _ensureAiObjectsInitialized(self):
    """Ensure aiObjects is initialized."""
    if self.aiObjects is not None:
        return
    logger.info("Lazy initializing AiObjects...")
    self.aiObjects = await AiObjects.create()
    logger.info("AiObjects initialization completed")


@classmethod
async def create(cls, serviceCenter=None) -> "AiService":
    """Create AiService instance with all connectors initialized."""
    logger.info("AiService.create() called")
    instance = cls(serviceCenter)
    logger.info("AiService created, about to call AiObjects.create()...")
    # Eagerly initialize the connector bundle so the instance is ready to use
    instance.aiObjects = await AiObjects.create()
    logger.info("AiObjects.create() completed")
    return instance
    # Web Research - Using interface functions
    async def webResearch(self, request: WebResearchRequest) -> WebResearchActionResult:
        """Perform web research using interface functions.

        Pipeline:
          1. Obtain candidate "main" URLs — either the caller-provided
             request.urls, or (a) ask the AI to optimize a search query,
             (b) run a web search, (c) ask the AI to pick main URLs.
          2. Filter URLs by relevance to the user's prompt via AI.
          3. Recursively crawl the selected URLs to the configured depth.

        Returns:
            WebResearchActionResult with a single JSON document holding the
            combined raw page content (no analysis), or success=False with
            an error message. Never raises: all failures are caught below.
        """
        try:
            logger.info(f"WEB RESEARCH STARTED")
            logger.info(f"User Query: {request.user_prompt}")
            logger.info(f"Max Results: {request.max_results}, Max Pages: {request.options.max_pages}")

            # Step 1: Find relevant websites - either provided URLs or AI-determined main URLs
            logger.info(f"=== STEP 1: INITIAL MAIN URLS LIST ===")

            if request.urls:
                # Use provided URLs as initial main URLs
                websites = request.urls
                logger.info(f"Using provided URLs ({len(websites)}):")
                for i, url in enumerate(websites, 1):
                    logger.info(f"  {i}. {url}")
            else:
                # Use AI to determine main URLs based on user's intention
                logger.info(f"AI analyzing user intent: '{request.user_prompt}'")

                # Use AI to generate optimized Tavily search query and search parameters
                query_optimizer_prompt = f"""You are a search query optimizer.

USER QUERY: {request.user_prompt}

Your task: Create a search query and parameters for the USER QUERY given.

RULES:
1. The search query MUST be related to the user query above
2. Extract key terms from the user query
3. Determine appropriate country/language based on the query context
4. Keep search query short (2-6 words)

Return ONLY this JSON format:
{{
    "user_prompt": "search query based on user query above",
    "country": "country_code_or_null",
    "language": "language_code_or_null",
    "topic": "general|news|academic_or_null",
    "time_range": "d|w|m|y_or_null",
    "selection_strategy": "single|multiple|specific_page",
    "selection_criteria": "what URLs to prioritize",
    "expected_url_patterns": ["pattern1", "pattern2"],
    "estimated_result_count": number
}}"""

                # Get AI response for query optimization
                ai_request = AiCallRequest(
                    prompt=query_optimizer_prompt,
                    options=AiCallOptions()
                )
                ai_response_obj = await self.aiObjects.call(ai_request)
                ai_response = ai_response_obj.content
                logger.debug(f"AI query optimizer response: {ai_response}")

                # Parse AI response to extract search query
                import json
                try:
                    # Clean the response by removing markdown code blocks
                    # NOTE(review): only a leading ```json fence is stripped;
                    # a bare ``` opening fence would survive — confirm models
                    # always emit the ```json variant.
                    cleaned_response = ai_response.strip()
                    if cleaned_response.startswith('```json'):
                        cleaned_response = cleaned_response[7:]  # Remove ```json
                    if cleaned_response.endswith('```'):
                        cleaned_response = cleaned_response[:-3]  # Remove ```
                    cleaned_response = cleaned_response.strip()

                    query_data = json.loads(cleaned_response)
                    search_query = query_data.get("user_prompt", request.user_prompt)
                    ai_country = query_data.get("country")
                    ai_language = query_data.get("language")
                    ai_topic = query_data.get("topic")
                    ai_time_range = query_data.get("time_range")
                    selection_strategy = query_data.get("selection_strategy", "multiple")
                    selection_criteria = query_data.get("selection_criteria", "relevant URLs")
                    expected_patterns = query_data.get("expected_url_patterns", [])
                    estimated_count = query_data.get("estimated_result_count", request.max_results)

                    logger.info(f"AI optimized search query: '{search_query}'")
                    logger.info(f"Selection strategy: {selection_strategy}")
                    logger.info(f"Selection criteria: {selection_criteria}")
                    logger.info(f"Expected URL patterns: {expected_patterns}")
                    logger.info(f"Estimated result count: {estimated_count}")

                except json.JSONDecodeError:
                    # Fall back to the raw user prompt with no search hints.
                    logger.warning("Failed to parse AI response as JSON, using original query")
                    search_query = request.user_prompt
                    ai_country = None
                    ai_language = None
                    ai_topic = None
                    ai_time_range = None
                    selection_strategy = "multiple"

                # Perform the web search with AI-determined parameters
                search_kwargs = {
                    "query": search_query,
                    "max_results": request.max_results,
                    "search_depth": request.options.search_depth,
                    "auto_parameters": False  # Use explicit parameters
                }

                # Add parameters only if they have valid values
                # (AI-suggested value wins; request.options is the fallback).
                if ai_country and ai_country not in ['null', '', 'none', 'undefined']:
                    search_kwargs["country"] = ai_country
                elif request.options.country and request.options.country not in ['null', '', 'none', 'undefined']:
                    search_kwargs["country"] = request.options.country

                if ai_language and ai_language not in ['null', '', 'none', 'undefined']:
                    search_kwargs["language"] = ai_language
                elif request.options.language and request.options.language not in ['null', '', 'none', 'undefined']:
                    search_kwargs["language"] = request.options.language

                if ai_topic and ai_topic in ['general', 'news', 'academic']:
                    search_kwargs["topic"] = ai_topic
                elif request.options.topic and request.options.topic in ['general', 'news', 'academic']:
                    search_kwargs["topic"] = request.options.topic

                if ai_time_range and ai_time_range in ['d', 'w', 'm', 'y']:
                    search_kwargs["time_range"] = ai_time_range
                elif request.options.time_range and request.options.time_range in ['d', 'w', 'm', 'y']:
                    search_kwargs["time_range"] = request.options.time_range

                # Log the parameters being used
                logger.info(f"Search parameters: country={search_kwargs.get('country', 'not_set')}, language={search_kwargs.get('language', 'not_set')}, topic={search_kwargs.get('topic', 'not_set')}, time_range={search_kwargs.get('time_range', 'not_set')}")

                search_results = await self.aiObjects.search_websites(**search_kwargs)

                logger.debug(f"Web search returned {len(search_results)} results:")
                for i, result in enumerate(search_results, 1):
                    logger.debug(f"  {i}. {result.url} - {result.title}")

                # Deduplicate while preserving order
                seen = set()
                search_urls = []
                for r in search_results:
                    u = str(r.url)
                    if u not in seen:
                        seen.add(u)
                        search_urls.append(u)

                if not search_urls:
                    logger.error("No relevant websites found")
                    return WebResearchActionResult(success=False, error="No relevant websites found")

                # Now use AI to determine the main URLs based on user's intention
                logger.info(f"AI selecting main URLs from {len(search_urls)} search results based on user intent")

                # Create a prompt for AI to identify main URLs based on user's intention
                ai_prompt = f"""
                Select the most relevant URLs from these search results:

                {chr(10).join([f"{i+1}. {url}" for i, url in enumerate(search_urls)])}

                Return only the URLs that are most relevant for the user's query.
                One URL per line.
                """

                # Create AI call request
                ai_request = AiCallRequest(
                    prompt=ai_prompt,
                    options=AiCallOptions()
                )
                ai_response_obj = await self.aiObjects.call(ai_request)
                ai_response = ai_response_obj.content
                logger.debug(f"AI response for main URL selection: {ai_response}")

                # Parse AI response to extract URLs
                # Takes the first http(s) token of each line; trailing
                # punctuation ('.', ',', ';') is stripped.
                websites = []
                for line in ai_response.strip().split('\n'):
                    line = line.strip()
                    if line and ('http://' in line or 'https://' in line):
                        # Extract URL from the line
                        for word in line.split():
                            if word.startswith('http://') or word.startswith('https://'):
                                websites.append(word.rstrip('.,;'))
                                break

                if not websites:
                    logger.warning("AI did not identify any main URLs, using first few search results")
                    websites = search_urls[:3]  # Fallback to first 3 search results

                # Deduplicate while preserving order
                seen = set()
                unique_websites = []
                for url in websites:
                    if url not in seen:
                        seen.add(url)
                        unique_websites.append(url)

                websites = unique_websites

                logger.info(f"AI selected {len(websites)} main URLs (after deduplication):")
                for i, url in enumerate(websites, 1):
                    logger.info(f"  {i}. {url}")

            # Step 2: Smart website selection using AI interface
            logger.info(f"=== STEP 2: FILTERED URL LIST BY USER PROMPT'S INTENTION ===")
            logger.info(f"AI analyzing {len(websites)} URLs for relevance to: '{request.user_prompt}'")

            selectedWebsites, aiResponse = await self.aiObjects.selectRelevantWebsites(websites, request.user_prompt)

            logger.debug(f"AI Response: {aiResponse}")
            logger.debug(f"AI selected {len(selectedWebsites)} most relevant URLs:")
            for i, url in enumerate(selectedWebsites, 1):
                logger.debug(f"  {i}. {url}")

            # Show which were filtered out
            filtered_out = [url for url in websites if url not in selectedWebsites]
            if filtered_out:
                logger.debug(f"Filtered out {len(filtered_out)} less relevant URLs:")
                for i, url in enumerate(filtered_out, 1):
                    logger.debug(f"  {i}. {url}")

            # Step 3+4+5: Recursive crawling with configurable depth
            logger.info(f"=== STEP 3+4+5: RECURSIVE CRAWLING (DEPTH {request.options.pages_search_depth}) ===")
            logger.info(f"Starting recursive crawl of {len(selectedWebsites)} main websites...")
            logger.info(f"Search depth: {request.options.pages_search_depth} levels")
            logger.info(f"DEBUG: request.options.pages_search_depth = {request.options.pages_search_depth}")

            # Use recursive crawling with URL index to avoid duplicates
            allContent = await self.aiObjects.crawlRecursively(
                urls=selectedWebsites,
                max_depth=request.options.pages_search_depth,
                extract_depth=request.options.extract_depth,
                max_per_domain=10
            )

            if not allContent:
                logger.error("Could not extract content from any websites")
                return WebResearchActionResult(success=False, error="Could not extract content from any websites")

            logger.info(f"=== WEB RESEARCH COMPLETED ===")
            logger.info(f"Successfully crawled {len(allContent)} URLs total")
            logger.info(f"Crawl depth: {request.options.pages_search_depth} levels")

            # Create simple result with raw content
            sources = [WebSearchResultItem(title=url, url=url) for url in selectedWebsites]

            # Get all additional links (all URLs except main ones)
            additional_links = [url for url in allContent.keys() if url not in selectedWebsites]

            # Combine all content into a single result
            combinedContent = ""
            for url, content in allContent.items():
                combinedContent += f"\n\n=== {url} ===\n{content}\n"

            documentData = WebResearchDocumentData(
                user_prompt=request.user_prompt,
                websites_analyzed=len(allContent),
                additional_links_found=len(additional_links),
                analysis_result=combinedContent,  # Raw content, no analysis
                sources=sources,
                additional_links=additional_links,
                individual_content=allContent,  # Individual URL -> content mapping
                debug_info={
                    "crawl_depth": request.options.pages_search_depth,
                    "total_urls_crawled": len(allContent),
                    "main_urls": len(selectedWebsites),
                    "additional_urls": len(additional_links)
                }
            )

            document = WebResearchActionDocument(
                documentName=f"web_research_{request.user_prompt[:50]}.json",
                documentData=documentData,
                mimeType="application/json"
            )

            return WebResearchActionResult(
                success=True,
                documents=[document],
                resultLabel="web_research_results"
            )

        except Exception as e:
            # Catch-all boundary: callers receive a failed result, never an exception.
            logger.error(f"Error in web research: {str(e)}")
            return WebResearchActionResult(success=False, error=str(e))
self._calculateMaxContextBytes(options) + + # Build extraction options with model-derived limits extractionOptions: Dict[str, Any] = { - "prompt": f"Extract relevant content for {operationType}", + "prompt": f"Extract content that supports the user's request: '{userPrompt}'. Focus on information relevant to: {operationType}", "operationType": operationType, "processDocumentsIndividually": processIndividually, - # Respect size/ chunking hints if provided via AiCallOptions - "maxSize": getattr(getattr(self, "_aiOptions", None), "maxContextBytes", None) or 0, - "chunkAllowed": getattr(getattr(self, "_aiOptions", None), "chunkAllowed", True), - # basic merge strategy for text by parent - "mergeStrategy": {"groupBy": "parentId", "orderBy": "pageIndex"}, + "maxSize": maxContextBytes, + "chunkAllowed": not options.compressContext if options else True, + "textChunkSize": int(maxContextBytes * 0.3), # 30% of max for text chunks + "imageChunkSize": int(maxContextBytes * 0.5), # 50% of max for image chunks + "imageMaxPixels": 1024 * 1024, # 1MP default + "imageQuality": 85, + "mergeStrategy": { + "groupBy": "typeGroup", + "orderBy": "id", + "mergeType": "concatenate" + }, } - # Prepare documentList for extractor - documentList: List[Dict[str, Any]] = [] - for d in documents: - documentList.append({ - "id": d.id, - "bytes": d.fileData, - "fileName": d.fileName, - "mimeType": d.mimeType, - }) - processedContents: List[str] = [] try: - extractionResult = self.extractionService.extractContent(documentList, extractionOptions) + # Use new ChatDocument-based API + logger.info(f"=== PROCESSING {len(documents)} DOCUMENTS FOR AI ===") + for i, doc in enumerate(documents): + logger.info(f"Document {i}: {doc.fileName} (MIME: {doc.mimeType})") + + extractionResult = self.extractionService.extractContent(documents, extractionOptions) + logger.info(f"Extraction completed: {len(extractionResult)} results") - def _partsToText(parts) -> str: + async def _partsToText(parts, documentName: str, 
documentType: str, logger_ref) -> str: lines: List[str] = [] + logger_ref.debug(f"Processing {len(parts)} content parts for {documentName}") + for p in parts: + logger_ref.debug(f" Part: {p.typeGroup} ({p.mimeType}) - {len(p.data) if p.data else 0} chars") + if p.typeGroup in ("text", "table", "structure") and p.data and isinstance(p.data, str): lines.append(p.data) + elif p.typeGroup == "image" and p.data: + # Use AI to extract text from image with user prompt + logger_ref.debug(f" Processing image with AI using user prompt...") + try: + imageResult = await self.aiObjects.callImage( + prompt=userPrompt, + imageData=p.data, + mimeType=p.mimeType + ) + lines.append(f"[Image Analysis]: {imageResult}") + logger_ref.debug(f" AI image analysis completed: {len(imageResult)} chars") + except Exception as e: + logger_ref.warning(f" AI image processing failed: {e}") + lines.append(f"[Image Analysis Failed]: {str(e)}") return "\n\n".join(lines) if isinstance(extractionResult, list): for i, ec in enumerate(extractionResult): try: - contentText = _partsToText(ec.parts) + # Get document info for this extraction result + doc = documents[i] if i < len(documents) else None + docName = doc.fileName if doc else f"Document_{i}" + docType = doc.mimeType if doc else "unknown" + + contentText = await _partsToText(ec.parts, docName, docType, logger) + logger.debug(f"Document {i} content: {len(contentText)} chars") + if compressDocuments and len(contentText.encode("utf-8")) > 10000: + originalLength = len(contentText) contentText = await self._compressContent(contentText, 10000, "document") + logger.debug(f"Document {i} compressed: {originalLength} -> {len(contentText)} chars") + processedContents.append(contentText) except Exception as e: logger.warning(f"Error aggregating extracted content: {str(e)}") @@ -225,7 +468,215 @@ class AiService: logger.warning(f"Error during extraction: {str(e)}") processedContents.append("[Error during extraction]") - return 
    def _calculateMaxContextBytes(self, options: Optional[AiCallOptions]) -> int:
        """Calculate maximum context bytes based on model capabilities and options.

        An explicit options.maxContextBytes wins; otherwise a conservative
        default of 4000 tokens is scaled down by the safety margin and
        converted to bytes at ~4 chars/token.
        """
        # NOTE(review): a caller passing maxContextBytes=0 falls through to
        # the default here (truthiness check) — confirm 0 means "unset".
        if options and options.maxContextBytes:
            return options.maxContextBytes

        # Default model capabilities (this should be enhanced with actual model registry)
        defaultMaxTokens = 4000
        safetyMargin = options.safetyMargin if options else 0.1

        # Calculate bytes (4 chars per token estimation)
        maxContextBytes = int(defaultMaxTokens * (1 - safetyMargin) * 4)

        return maxContextBytes

    async def _processDocumentsPerChunk(
        self,
        documents: List[ChatDocument],
        prompt: str,
        options: Optional[AiCallOptions] = None
    ) -> str:
        """
        Process documents with per-chunk AI calls and merge results.

        Chunks the documents via ExtractionService, sends each text chunk
        (and each image chunk, via readImage) to the AI individually, then
        merges the per-chunk AI answers back with mergeAiResults.

        Args:
            documents: List of ChatDocument objects to process
            prompt: AI prompt for processing
            options: AI call options

        Returns:
            Merged AI results as string
        """
        if not documents:
            return ""

        # Get model capabilities for size calculation
        model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options)

        # Build extraction options for chunking
        extractionOptions: Dict[str, Any] = {
            "prompt": prompt,
            "operationType": options.operationType if options else "general",
            "processDocumentsIndividually": True,  # Process each document separately
            "maxSize": model_capabilities["maxContextBytes"],
            "chunkAllowed": True,
            "textChunkSize": model_capabilities["textChunkSize"],
            "imageChunkSize": model_capabilities["imageChunkSize"],
            "imageMaxPixels": 1024 * 1024,
            "imageQuality": 85,
            "mergeStrategy": {
                "groupBy": "typeGroup",
                "orderBy": "id",
                "mergeType": "concatenate"
            },
        }

        logger.debug(f"Per-chunk extraction options: {extractionOptions}")

        try:
            # Extract content with chunking
            extractionResult = self.extractionService.extractContent(documents, extractionOptions)

            if not isinstance(extractionResult, list):
                return "[Error: No extraction results]"

            # Prepare debug directory TODO TO REMOVE
            # Debug dumps below are best-effort: every filesystem failure is
            # swallowed so debugging never breaks the processing pipeline.
            import os
            from datetime import datetime, UTC
            debug_root = "./test-chat/ai"
            ts = datetime.now(UTC).strftime('%Y%m%d-%H%M%S-%f')[:-3]
            debug_dir = os.path.join(debug_root, f"{ts}_extraction_per_chunk")
            try:
                os.makedirs(debug_dir, exist_ok=True)
            except Exception:
                pass

            # Process each chunk with AI
            aiResults: List[str] = []

            for ec in extractionResult:
                for part in ec.parts:
                    if part.typeGroup == "image":
                        # Process image with AI
                        try:
                            # Safety check for part.data
                            if not hasattr(part, 'data') or part.data is None:
                                logger.warning(f"Skipping image chunk with no data")
                                continue

                            aiResult = await self.readImage(
                                prompt=prompt,
                                imageData=part.data,
                                mimeType=part.mimeType,
                                options=options
                            )
                            aiResults.append(aiResult)
                        except Exception as e:
                            # Error placeholder keeps aiResults aligned with chunks for the merge step.
                            logger.warning(f"Error processing image chunk: {str(e)}")
                            aiResults.append(f"[Error processing image: {str(e)}]")

                    elif part.typeGroup in ("text", "table", "structure"):
                        # Process text content with AI
                        try:
                            # Safety check for part.data
                            if not hasattr(part, 'data') or part.data is None:
                                logger.warning(f"Skipping chunk with no data")
                                continue

                            logger.info(f"=== PROCESSING CHUNK {len(aiResults) + 1} ===")
                            logger.info(f"Chunk size: {len(part.data)} chars")
                            logger.info(f"Chunk preview: {part.data[:200]}...")

                            # Dump input chunk
                            try:
                                idx = len(aiResults) + 1
                                fpath = os.path.join(debug_dir, f"chunk_{idx:03d}_input.txt")
                                with open(fpath, "w", encoding="utf-8") as f:
                                    f.write(str(part.data))
                            except Exception:
                                pass

                            # Create AI call request for this chunk
                            request = AiCallRequest(
                                prompt=prompt,
                                context=part.data,
                                options=options
                            )

                            # Make the call using AiObjects
                            response = await self.aiObjects.call(request)
                            aiResults.append(response.content)

                            logger.info(f"Chunk {len(aiResults)} processed: {len(response.content)} chars response")
                            # Dump AI response
                            try:
                                idx = len(aiResults)
                                fpath = os.path.join(debug_dir, f"chunk_{idx:03d}_response.txt")
                                with open(fpath, "w", encoding="utf-8") as f:
                                    f.write(str(response.content))
                            except Exception:
                                pass

                        except Exception as e:
                            logger.warning(f"Error processing text chunk: {str(e)}")
                            aiResults.append(f"[Error processing text: {str(e)}]")

            # Merge AI results using ExtractionService
            from modules.datamodels.datamodelExtraction import MergeStrategy

            mergeStrategy = MergeStrategy(
                groupBy="typeGroup",
                orderBy="id",
                mergeType="concatenate",
                chunkSeparator="\n\n---\n\n"
            )

            mergedContent = self.extractionService.mergeAiResults(
                extractionResult,
                aiResults,
                mergeStrategy
            )

            # Extract only AI-generated text from merged content
            # NOTE(review): assumes part.metadata is a dict when present —
            # a None metadata attribute would raise here; confirm upstream.
            resultText = ""
            for part in mergedContent.parts:
                if (
                    part.typeGroup in ("text", "table", "structure")
                    and part.data
                    and getattr(part, "metadata", {}).get("aiResult", False)
                ):
                    resultText += part.data + "\n\n"

            # Dump merged output
            try:
                fpath = os.path.join(debug_dir, "merged_output.txt")
                with open(fpath, "w", encoding="utf-8") as f:
                    f.write(resultText.strip())
            except Exception:
                pass

            return resultText.strip()

        except Exception as e:
            # Boundary: return an error string instead of raising to callers.
            logger.error(f"Error in per-chunk processing: {str(e)}")
            return f"[Error in per-chunk processing: {str(e)}]"
Args: prompt: The main prompt for the AI call documents: Optional list of documents to process - placeholders: Optional dictionary of placeholder replacements for planning calls + placeholders: Optional list of placeholder replacements for planning calls options: AI call configuration options + outputFormat: Optional output format (html, pdf, docx, txt, md, json, csv, xlsx) for document generation + title: Optional title for generated documents Returns: - AI response as string + AI response as string, or dict with documents if outputFormat is specified Raises: Exception: If all available models fail """ + # Ensure aiObjects is initialized + await self._ensureAiObjectsInitialized() + if options is None: options = AiCallOptions() + # Normalize placeholders from List[PromptPlaceholder] + placeholders_dict: Dict[str, str] = {} + placeholders_meta: Dict[str, bool] = {} + if placeholders: + placeholders_dict = {p.label: p.content for p in placeholders} + placeholders_meta = {p.label: bool(getattr(p, 'summaryAllowed', False)) for p in placeholders} + # Auto-determine call type based on documents and operation type call_type = self._determineCallType(documents, options.operationType) options.callType = call_type + # Log the prompt being sent to AI for debugging (before routing) TODO TO REMOVE + try: + # Build the full prompt that will be sent to AI + if placeholders: + full_prompt = prompt + for p in placeholders: + placeholder = f"{{{{KEY:{p.label}}}}}" + full_prompt = full_prompt.replace(placeholder, p.content) + else: + full_prompt = prompt + + self._writeAiResponseDebug( + label='ai_prompt_debug', + content=full_prompt, + partIndex=1, + modelName=None, + continuation=False + ) + except Exception: + pass + + # Handle document generation with specific output format + if outputFormat: + result = await self._callAiWithDocumentGeneration(prompt, documents, options, outputFormat, title) + # Log AI response for debugging TODO TO REMOVE + try: + if isinstance(result, dict) and 
'content' in result: + self._writeAiResponseDebug( + label='ai_document_generation', + content=result['content'], + partIndex=1, + modelName=None, # Document generation doesn't return model info + continuation=False + ) + except Exception: + pass + return result + if call_type == "planning": - return await self._callAiPlanning(prompt, placeholders, options) + result = await self._callAiPlanning(prompt, placeholders_dict, placeholders_meta, options) + # Log AI response for debugging TODO TO REMOVE + try: + self._writeAiResponseDebug( + label='ai_planning', + content=result or "", + partIndex=1, + modelName=None, # Planning doesn't return model info + continuation=False + ) + except Exception: + pass + return result else: - return await self._callAiText(prompt, documents, options) + # Set processDocumentsIndividually from the legacy parameter if not set in options + if options.processDocumentsIndividually is None and documents: + options.processDocumentsIndividually = False # Default to batch processing + + # For text calls, we need to build the full prompt with placeholders here + # since _callAiText doesn't handle placeholders directly + if placeholders_dict: + full_prompt = self._buildPromptWithPlaceholders(prompt, placeholders_dict) + else: + full_prompt = prompt + + result = await self._callAiText(full_prompt, documents, options) + # Log AI response for debugging (additional logging for text calls) TODO TO REMOVE + try: + self._writeAiResponseDebug( + label='ai_text_main', + content=result or "", + partIndex=1, + modelName=None, # Text calls already log internally + continuation=False + ) + except Exception: + pass + return result def _determineCallType(self, documents: Optional[List[ChatDocument]], operation_type: str) -> str: """ Determine call type based on documents and operation type. 
- Criteria: no documents AND (operationType is "generate_plan" or "analyse_content") -> planning + Criteria: no documents AND operationType is "generate_plan" -> planning + All other cases -> text """ has_documents = documents is not None and len(documents) > 0 - is_planning_operation = operation_type in [OperationType.GENERATE_PLAN, OperationType.ANALYSE_CONTENT] + is_planning_operation = operation_type == OperationType.GENERATE_PLAN if not has_documents and is_planning_operation: return "planning" @@ -302,117 +841,378 @@ class AiService: self, prompt: str, placeholders: Optional[Dict[str, str]], + placeholdersMeta: Optional[Dict[str, bool]], options: AiCallOptions - ) -> str: + ) -> str: """ Handle planning calls with placeholder system and selective summarization. """ - # Get available models for planning (text + reasoning capabilities) - models = self._getModelsForOperation("planning", options) + # Ensure aiObjects is initialized + await self._ensureAiObjectsInitialized() - for model in models: + # Build full prompt with placeholders; if too large, summarize summaryAllowed placeholders proportionally + effective_placeholders = placeholders or {} + full_prompt = self._buildPromptWithPlaceholders(prompt, effective_placeholders) + + if options.compressPrompt and placeholdersMeta: + # Determine model capacity try: - # Build full prompt with placeholders - full_prompt = self._buildPromptWithPlaceholders(prompt, placeholders) - - # Check size and reduce if needed - if self._exceedsTokenLimit(full_prompt, model, options.safetyMargin): - full_prompt = self._reducePlanningPrompt(full_prompt, placeholders, model, options) - - # Make AI call using existing callAiText - result = await self.callAiText( - prompt=full_prompt, - documents=None, - options=options - ) - return result - - except Exception as e: - logger.warning(f"Planning model {model.name} failed: {e}") - continue + caps = self._getModelCapabilitiesForContent(full_prompt, None, options) + max_bytes = 
caps.get("maxContextBytes", len(full_prompt.encode("utf-8"))) + except Exception: + max_bytes = len(full_prompt.encode("utf-8")) + + current_bytes = len(full_prompt.encode("utf-8")) + if current_bytes > max_bytes: + # Compute total bytes contributed by allowed placeholders (approximate by content length) + allowed_labels = [l for l, allow in placeholdersMeta.items() if allow] + allowed_sizes = {l: len((effective_placeholders.get(l) or "").encode("utf-8")) for l in allowed_labels} + total_allowed = sum(allowed_sizes.values()) + + overage = current_bytes - max_bytes + if total_allowed > 0 and overage > 0: + # Target total for allowed after reduction + target_allowed = max(total_allowed - overage, 0) + # Global ratio to apply across allowed placeholders + ratio = target_allowed / total_allowed if total_allowed > 0 else 1.0 + ratio = max(0.0, min(1.0, ratio)) + + reduced: Dict[str, str] = {} + for label, content in effective_placeholders.items(): + if label in allowed_labels and isinstance(content, str) and len(content) > 0: + old_len = len(content) + # Reduce by proportional ratio on characters (fallback if empty) + reduction_factor = ratio if old_len > 0 else 1.0 + reduced[label] = self._reduceText(content, reduction_factor) + else: + reduced[label] = content + + effective_placeholders = reduced + full_prompt = self._buildPromptWithPlaceholders(prompt, effective_placeholders) + + # If still slightly over, perform a second-pass fine adjustment with updated ratio + current_bytes = len(full_prompt.encode("utf-8")) + if current_bytes > max_bytes and total_allowed > 0: + overage2 = current_bytes - max_bytes + # Recompute allowed sizes after first reduction + allowed_sizes2 = {l: len((effective_placeholders.get(l) or "").encode("utf-8")) for l in allowed_labels} + total_allowed2 = sum(allowed_sizes2.values()) + if total_allowed2 > 0 and overage2 > 0: + target_allowed2 = max(total_allowed2 - overage2, 0) + ratio2 = target_allowed2 / total_allowed2 + ratio2 = max(0.0, 
min(1.0, ratio2)) + reduced2: Dict[str, str] = {} + for label, content in effective_placeholders.items(): + if label in allowed_labels and isinstance(content, str) and len(content) > 0: + old_len = len(content) + reduction_factor = ratio2 if old_len > 0 else 1.0 + reduced2[label] = self._reduceText(content, reduction_factor) + else: + reduced2[label] = content + effective_placeholders = reduced2 + full_prompt = self._buildPromptWithPlaceholders(prompt, effective_placeholders) - raise Exception("All planning models failed - check model availability and capabilities") + + # Make AI call using AiObjects (let it handle model selection) + request = AiCallRequest( + prompt=full_prompt, + context="", # Context is already included in the prompt + options=options + ) + response = await self.aiObjects.call(request) + try: + logger.debug(f"AI model selected (planning): {getattr(response, 'modelName', 'unknown')}") + except Exception: + pass + return response.content async def _callAiText( self, prompt: str, documents: Optional[List[ChatDocument]], options: AiCallOptions - ) -> str: + ) -> str: """ Handle text calls with document processing through ExtractionService. 
""" - # Get available models for text processing - models = self._getModelsForOperation("text", options) + # Ensure aiObjects is initialized + await self._ensureAiObjectsInitialized() - for model in models: - try: - # Extract and process documents using ExtractionService + # Determine processing strategy based on options + if options.processDocumentsIndividually and documents: + # Use per-chunk processing for individual document processing + return await self._processDocumentsPerChunk(documents, prompt, options) + + # Check if we need chunking - if so, use per-chunk processing + if documents and not options.compressContext: + # Get model capabilities to check if chunking will be needed + model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options) + total_doc_size = sum(doc.fileSize or 0 for doc in documents) + + if total_doc_size > model_capabilities["maxContextBytes"]: + logger.info(f"Document size ({total_doc_size}) exceeds model capacity ({model_capabilities['maxContextBytes']}), using per-chunk processing") + return await self._processDocumentsPerChunk(documents, prompt, options) + + # Extract and process documents using ExtractionService + context = "" + if documents: + logger.info(f"=== EXTRACTING CONTENT FROM {len(documents)} DOCUMENTS ===") + + # Get model capabilities for size calculation + model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options) + + # Use new ChatDocument-based API + extraction_options = { + "prompt": prompt, + "operationType": options.operationType, + "processDocumentsIndividually": options.processDocumentsIndividually, + "maxSize": options.maxContextBytes or model_capabilities["maxContextBytes"], + "chunkAllowed": not options.compressContext, + "textChunkSize": model_capabilities["textChunkSize"], + "imageChunkSize": model_capabilities["imageChunkSize"], + "imageMaxPixels": 1024 * 1024, + "imageQuality": 85, + "mergeStrategy": {"groupBy": "typeGroup", "orderBy": "id", "mergeType": 
"concatenate"} + } + + logger.debug(f"Extraction options: {extraction_options}") + + extracted_content = self.extractionService.extractContent( + documents=documents, + options=extraction_options + ) + + logger.info(f"Extraction completed: {len(extracted_content)} documents") + + # Build context from list of extracted content + if isinstance(extracted_content, list): + context_parts = [] + chunk_count = 0 + for ec in extracted_content: + for p in ec.parts: + if p.typeGroup in ["text", "table", "structure"] and p.data: + if p.metadata.get("chunk", False): + chunk_count += 1 + context_parts.append(p.data) + elif p.typeGroup == "image" and p.data: + # Process image with AI using user prompt + try: + imageResult = await self.aiObjects.callImage( + prompt=prompt, + imageData=p.data, + mimeType=p.mimeType + ) + context_parts.append(f"[Image Analysis]: {imageResult}") + except Exception as e: + logger.warning(f"AI image processing failed: {e}") + context_parts.append(f"[Image Analysis Failed]: {str(e)}") + + if chunk_count > 0: + logger.debug(f"=== PROCESSING CHUNKED CONTENT ===") + logger.debug(f"Total chunks: {chunk_count}") + logger.debug(f"Total context parts: {len(context_parts)}") + + context = "\n\n---\n\n".join(context_parts) + else: context = "" - if documents: - # Convert ChatDocument to documentList format for ExtractionService - documentList = [{ - "id": d.id, - "bytes": d.fileData, - "fileName": d.fileName, - "mimeType": d.mimeType - } for d in documents] - - extracted_content = await self.extractionService.extractContent( - documentList=documentList, - options={ - "prompt": prompt, - "operationType": options.operationType, - "processDocumentsIndividually": options.processDocumentsIndividually, - "maxSize": options.maxContextBytes or int(model.maxTokens * 0.9), - "chunkAllowed": not options.compressContext, - "mergeStrategy": {"groupBy": "typeGroup"} - } - ) - - # Build context from list of ExtractedContent - if isinstance(extracted_content, list): - context 
= "\n\n---\n\n".join([ - "\n\n".join([ - p.data for p in ec.parts if p.typeGroup in ["text", "table", "structure"] and p.data - ]) for ec in extracted_content - ]) - else: - context = "" - - # Check size and reduce if needed - full_prompt = prompt + "\n\n" + context if context else prompt - if self._exceedsTokenLimit(full_prompt, model, options.safetyMargin): - full_prompt = self._reduceTextPrompt(prompt, context, model, options) - - # Make AI call using existing callAiText - result = await self.callAiText( - prompt=full_prompt, - documents=None, + + # Check size and reduce if needed + full_prompt = prompt + "\n\n" + context if context else prompt + + # Add generic completeness guidance: first vs subsequent (based on presence of context) + try: + if context and context.strip(): + # Subsequent calls with prior context: continue next part only + full_prompt += ( + "\n\nINSTRUCTIONS (COMPLETENESS):\n" + "- Continue from where the previous content ended. Do NOT repeat earlier content.\n" + "- If more parts are still needed after this response, the LAST LINE of your response MUST be exactly: 'CONTINUATION: true'.\n" + "- If the content is now complete, the LAST LINE of your response MUST be exactly: 'CONTINUATION: false'.\n" + "- The continuation line MUST be the final line of your output. Do NOT output anything after it (no notes, no explanations).\n" + ) + else: + # First call (no prior context): deliver full content or first part + full_prompt += ( + "\n\nINSTRUCTIONS (COMPLETENESS):\n" + "- Deliver the complete content. Do NOT truncate.\n" + "- If platform limits force truncation, provide the first complete section(s) only and ensure the LAST LINE of your response is exactly: 'CONTINUATION: true'.\n" + "- If the entire content is fully included, ensure the LAST LINE of your response is exactly: 'CONTINUATION: false'.\n" + "- The continuation line MUST be the final line of your output. 
Do NOT output anything after it (no notes, no explanations).\n" + ) + except Exception: + # Non-fatal if any issue building guidance + pass + logger.debug(f"AI call: {len(full_prompt)} chars (prompt: {len(prompt)}, context: {len(context)})") + + # Use AiObjects to select the best model and make the call + try: + # Helper to detect and strip continuation flag + import re + def _split_content_and_flag(text: str) -> (str, bool): + if not text: + return "", False + lines = text.strip().splitlines() + cont = False + # Scan last 3 lines for flag to be robust + for i in range(1, min(4, len(lines))+1): + m = re.match(r"^\s*CONTINUATION:\s*(true|false)\s*$", lines[-i].strip(), re.IGNORECASE) + if m: + cont = m.group(1).lower() == 'true' + # remove the matched flag line + del lines[-i] + break + return "\n".join(lines).strip(), cont + + # First call + request = AiCallRequest( + prompt=full_prompt, + context="", + options=options + ) + response = await self.aiObjects.call(request) + try: + logger.debug(f"AI model selected (text): {getattr(response, 'modelName', 'unknown')}") + except Exception: + pass + content_first = response.content or "" + merged_content, needs_more = _split_content_and_flag(content_first) + + # Iteratively request next parts if flagged + # Allow configurable max parts via options; default = 1000 + try: + max_parts = int(getattr(options, 'maxParts', 1000) or 1000) + except Exception: + max_parts = 1000 + part_index = 1 + while needs_more and part_index < max_parts: + part_index += 1 + # Build subsequent prompt with explicit continuation instructions + subsequent_prompt = ( + prompt + + "\n\nINSTRUCTIONS (CONTINUE NEXT PART ONLY):\n" + "- Continue from where the previous content ended.\n" + "- Do NOT repeat earlier content.\n" + "- The LAST LINE of your response MUST be exactly one of: 'CONTINUATION: true' (if more parts are needed) or 'CONTINUATION: false' (if complete).\n" + "- The continuation line MUST be the final line of your output. 
Do NOT output anything after it (no notes, no explanations).\n" + ) + next_request = AiCallRequest( + prompt=subsequent_prompt, + context=merged_content, options=options ) - return result - - except Exception as e: - logger.warning(f"Text model {model.name} failed: {e}") - continue + next_response = await self.aiObjects.call(next_request) + part_text = next_response.content or "" + part_clean, needs_more = _split_content_and_flag(part_text) + if part_clean: + # Separate parts clearly + merged_content = (merged_content + "\n\n" + part_clean).strip() + else: + # Avoid infinite loops on empty parts + break + + logger.debug(f"=== AI RESPONSE (MERGED) ===") + logger.debug(f"Response length: {len(merged_content)} chars") + logger.debug(f"Response preview: {merged_content[:200]}...") + return merged_content + + except Exception as e: + logger.error(f"AI call failed: {e}") + raise Exception(f"AI call failed: {e}") + + + def _getModelCapabilitiesForContent(self, prompt: str, documents: Optional[List[ChatDocument]], options: AiCallOptions) -> Dict[str, int]: + """ + Get model capabilities for content processing, including appropriate size limits for chunking. 
+ """ + # Estimate total content size + prompt_size = len(prompt.encode('utf-8')) + document_size = 0 + if documents: + # Rough estimate of document content size + for doc in documents: + document_size += doc.fileSize or 0 - raise Exception("All text models failed - check model availability and capabilities") + total_size = prompt_size + document_size + + # Use AiObjects to select the best model for this content size + # We'll simulate the model selection by checking available models + from modules.interfaces.interfaceAiObjects import aiModels + + # Find the best model for this content size and operation + best_model = None + best_context_length = 0 + + for model_name, model_info in aiModels.items(): + context_length = model_info.get("contextLength", 0) + + # Skip models with no context length or too small for content + if context_length == 0: + continue + + # Check if model supports the operation type + capabilities = model_info.get("capabilities", []) + if options.operationType == OperationType.IMAGE_ANALYSIS and "image_analysis" not in capabilities: + continue + elif options.operationType == OperationType.IMAGE_GENERATION and "image_generation" not in capabilities: + continue + elif options.operationType == OperationType.WEB_RESEARCH and "web_search" not in capabilities: + continue + elif "text_generation" not in capabilities: + continue + + # Prefer models that can handle the content without chunking, but allow chunking if needed + if context_length >= total_size * 0.8: # 80% of content size + if context_length > best_context_length: + best_model = model_info + best_context_length = context_length + elif best_model is None: # Fallback to largest available model + if context_length > best_context_length: + best_model = model_info + best_context_length = context_length + + # Fallback to a reasonable default if no model found + if best_model is None: + best_model = { + "contextLength": 128000, # GPT-4o default + "llmName": "gpt-4o" + } + + # Calculate appropriate 
sizes + # Convert tokens to bytes (rough estimate: 1 token ≈ 4 characters) + context_length_bytes = int(best_model["contextLength"] * 4) + max_context_bytes = int(context_length_bytes * 0.9) # 90% of context length + text_chunk_size = int(max_context_bytes * 0.7) # 70% of max context for text chunks + image_chunk_size = int(max_context_bytes * 0.8) # 80% of max context for image chunks + + logger.debug(f"Selected model: {best_model.get('llmName', 'unknown')} with context length: {best_model['contextLength']}") + logger.debug(f"Content size: {total_size} bytes, Max context: {max_context_bytes} bytes") + logger.debug(f"Text chunk size: {text_chunk_size} bytes, Image chunk size: {image_chunk_size} bytes") + + return { + "maxContextBytes": max_context_bytes, + "textChunkSize": text_chunk_size, + "imageChunkSize": image_chunk_size + } def _getModelsForOperation(self, operation_type: str, options: AiCallOptions) -> List[ModelCapabilities]: """ Get models capable of handling the specific operation with capability filtering. 
""" - # For now, return a default model - this will be enhanced with actual model registry - default_model = ModelCapabilities( - name="default", - maxTokens=4000, - capabilities=["text", "reasoning"] if operation_type == "planning" else ["text"], - costPerToken=0.001, - processingTime=1.0, - isAvailable=True - ) - return [default_model] + # Use the actual AI objects model selection instead of hardcoded default + if hasattr(self, 'aiObjects') and self.aiObjects: + # Let AiObjects handle the model selection + return [] + else: + # Fallback to default model if AiObjects not available + default_model = ModelCapabilities( + name="default", + maxTokens=4000, + capabilities=["text", "reasoning"] if operation_type == "planning" else ["text"], + costPerToken=0.001, + processingTime=1.0, + isAvailable=True + ) + return [default_model] def _buildPromptWithPlaceholders(self, prompt: str, placeholders: Optional[Dict[str, str]]) -> str: """ @@ -430,6 +1230,79 @@ class AiService: return full_prompt + def _writeTraceLog(self, contextText: str, data: Any) -> None: + """Write raw data to the central trace log file without truncation.""" + try: + import os + import json + from datetime import datetime, UTC + # Only write if logger is in debug mode + if logger.level > logging.DEBUG: + return + # Get log directory from configuration via service center if possible + logDir = None + try: + if self.serviceCenter and hasattr(self.serviceCenter, 'utils'): + logDir = self.serviceCenter.utils.configGet("APP_LOGGING_LOG_DIR", "./") + except Exception: + pass + if not logDir: + logDir = "./" + if not os.path.isabs(logDir): + # Make it relative to gateway directory + gatewayDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + logDir = os.path.join(gatewayDir, logDir) + os.makedirs(logDir, exist_ok=True) + traceFile = os.path.join(logDir, "log_trace.log") + timestamp = datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] + traceEntry = f"[{timestamp}] 
{contextText}\n" + ("=" * 80) + "\n" + if data is None: + traceEntry += "No data provided\n" + else: + # Prefer exact text; if dict/list, pretty print JSON + try: + if isinstance(data, (dict, list)): + traceEntry += f"JSON Data:\n{json.dumps(data, indent=2, ensure_ascii=False)}\n" + else: + text = str(data) + traceEntry += f"Text Data:\n{text}\n" + except Exception: + traceEntry += f"Data (fallback): {str(data)}\n" + traceEntry += ("=" * 80) + "\n\n" + with open(traceFile, "a", encoding="utf-8") as f: + f.write(traceEntry) + except Exception: + # Swallow to avoid recursive logging issues + pass + + def _writeAiResponseDebug(self, label: str, content: str, partIndex: int = 1, modelName: str = None, continuation: bool = None) -> None: + """Persist raw AI response parts for debugging under test-chat/ai.""" + try: + import os + from datetime import datetime, UTC + # Base dir: gateway/test-chat/ai (go up 4 levels from this file) + # .../gateway/modules/services/serviceAi/mainServiceAi.py -> up to gateway root + gatewayDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + outDir = os.path.join(gatewayDir, 'test-chat', 'ai') + os.makedirs(outDir, exist_ok=True) + ts = datetime.now(UTC).strftime('%Y%m%d-%H%M%S-%f')[:-3] + suffix = [] + if partIndex is not None: + suffix.append(f"part{partIndex}") + if continuation is not None: + suffix.append(f"cont_{str(continuation).lower()}") + if modelName: + safeModel = ''.join(c if c.isalnum() or c in ('-', '_') else '-' for c in modelName) + suffix.append(safeModel) + suffixStr = ('_' + '_'.join(suffix)) if suffix else '' + fname = f"{ts}_{label}{suffixStr}.txt" + fpath = os.path.join(outDir, fname) + with open(fpath, 'w', encoding='utf-8') as f: + f.write(content or '') + except Exception: + # Do not raise; best-effort debug write + pass + def _exceedsTokenLimit(self, text: str, model: ModelCapabilities, safety_margin: float) -> bool: """ Check if text exceeds model token limit with 
safety margin. @@ -445,7 +1318,7 @@ class AiService: placeholders: Optional[Dict[str, str]], model: ModelCapabilities, options: AiCallOptions - ) -> str: + ) -> str: """ Reduce planning prompt size by summarizing placeholders while preserving prompt structure. """ @@ -470,7 +1343,7 @@ class AiService: context: str, model: ModelCapabilities, options: AiCallOptions - ) -> str: + ) -> str: """ Reduce text prompt size using typeGroup-aware chunking and merging. """ @@ -519,3 +1392,111 @@ class AiService: target_length = int(len(text) * reduction_factor) return text[:target_length] + "... [reduced]" + async def _callAiWithDocumentGeneration( + self, + prompt: str, + documents: Optional[List[ChatDocument]], + options: AiCallOptions, + outputFormat: str, + title: Optional[str] + ) -> Dict[str, Any]: + """ + Handle AI calls with document generation in specific output format. + + Args: + prompt: The main prompt for the AI call + documents: Optional list of documents to process + options: AI call configuration options + outputFormat: Target output format (html, pdf, docx, txt, md, json, csv, xlsx) + title: Optional title for generated documents + + Returns: + Dict with generated documents and metadata + """ + try: + # Get format-specific extraction prompt from generation service + from modules.services.serviceGeneration.mainServiceGeneration import GenerationService + generation_service = GenerationService(self.serviceCenter) + + # Use default title if not provided + if not title: + title = "AI Generated Document" + + # Get format-specific extraction prompt + extraction_prompt = generation_service.getExtractionPrompt( + output_format=outputFormat, + user_prompt=prompt, + title=title + ) + + # Process documents with format-specific prompt + ai_response = await self._callAiText(extraction_prompt, documents, options) + + # Parse filename header from AI response if present + parsed_filename = None + try: + if ai_response: + first_newline = ai_response.find('\n') + header_line = 
ai_response if first_newline == -1 else ai_response[:first_newline] + if header_line.strip().lower().startswith('filename:'): + parsed = header_line.split(':', 1)[1].strip() + # basic sanitization + import re + parsed = re.sub(r"[^a-zA-Z0-9._-]", "-", parsed) + parsed = re.sub(r"-+", "-", parsed).strip('-') + if parsed: + parsed_filename = parsed + # remove header line from content for rendering + ai_response = ai_response[first_newline+1:].lstrip('\n') if first_newline != -1 else '' + except Exception: + parsed_filename = None + + if not ai_response or ai_response.strip() == "": + raise Exception("AI content generation failed") + + # Render the content to the specified format + rendered_content, mime_type = await generation_service.renderReport( + extracted_content=ai_response, + output_format=outputFormat, + title=title + ) + + # Generate meaningful filename (use AI-provided if valid, else fallback) + from datetime import datetime, UTC + timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") + if parsed_filename and parsed_filename.lower().endswith(f".{outputFormat.lower()}"): + filename = parsed_filename + else: + safe_title = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-') + filename = f"{safe_title or 'document'}-{timestamp}.{outputFormat}" + + # Return structured result with document information + return { + "success": True, + "content": ai_response, # Raw AI response + "rendered_content": rendered_content, # Formatted content + "mime_type": mime_type, + "filename": filename, + "format": outputFormat, + "title": title, + "documents": [{ + "documentName": filename, + "documentData": rendered_content, + "mimeType": mime_type + }] + } + + except Exception as e: + logger.error(f"Error in document generation: {str(e)}") + return { + "success": False, + "error": str(e), + "content": "", + "rendered_content": "", + "mime_type": "text/plain", + "filename": f"error_{outputFormat}", + "format": outputFormat, + "title": title or "Error", + 
"documents": [] + } + diff --git a/modules/services/serviceExtraction/chunking/image_chunker.py b/modules/services/serviceExtraction/chunking/image_chunker.py new file mode 100644 index 00000000..bf09260a --- /dev/null +++ b/modules/services/serviceExtraction/chunking/image_chunker.py @@ -0,0 +1,182 @@ +from typing import Any, Dict, List +import base64 +import io + +from modules.datamodels.datamodelExtraction import ContentPart +from ..subRegistry import Chunker + + +class ImageChunker(Chunker): + """Chunker for reducing image size through resizing, compression, and tiling.""" + + def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]: + """ + Chunk an image by reducing its size through various strategies. + + Args: + part: ContentPart containing image data (base64 encoded) + options: Chunking options including: + - imageChunkSize: Maximum size in bytes for each chunk + - imageMaxPixels: Maximum pixels (width*height) for the image + - imageQuality: JPEG quality (0-100, default 85) + - imageTileSize: Size for tiling if image is still too large + + Returns: + List of image chunks with reduced size + """ + maxBytes = int(options.get("imageChunkSize", 1000000)) # 1MB default + maxPixels = int(options.get("imageMaxPixels", 1024 * 1024)) # 1MP default + quality = int(options.get("imageQuality", 85)) + tileSize = int(options.get("imageTileSize", 512)) # 512x512 tiles + + chunks: List[Dict[str, Any]] = [] + + try: + # Lazy import PIL to avoid hanging during module import + from PIL import Image + + # Decode base64 image data + imageData = base64.b64decode(part.data) + image = Image.open(io.BytesIO(imageData)) + + # Get original dimensions + originalWidth, originalHeight = image.size + originalPixels = originalWidth * originalHeight + + # Strategy 1: If image is small enough, return as-is + if len(part.data) <= maxBytes and originalPixels <= maxPixels: + chunks.append({ + "data": part.data, + "size": len(part.data), + "order": 0, + "metadata": { 
+ "originalSize": len(part.data), + "originalPixels": originalPixels, + "strategy": "original" + } + }) + return chunks + + # Strategy 2: Resize to fit within pixel limit + if originalPixels > maxPixels: + # Calculate new dimensions maintaining aspect ratio + scale = (maxPixels / originalPixels) ** 0.5 + newWidth = int(originalWidth * scale) + newHeight = int(originalHeight * scale) + + # Ensure minimum size + newWidth = max(newWidth, 64) + newHeight = max(newHeight, 64) + + image = image.resize((newWidth, newHeight), Image.Resampling.LANCZOS) + + # Strategy 3: Compress with quality reduction + currentSize = len(part.data) + currentQuality = quality + + while currentSize > maxBytes and currentQuality > 10: + # Compress image + output = io.BytesIO() + image.save(output, format='JPEG', quality=currentQuality, optimize=True) + compressedData = output.getvalue() + compressedB64 = base64.b64encode(compressedData).decode('utf-8') + currentSize = len(compressedB64) + + if currentSize <= maxBytes: + chunks.append({ + "data": compressedB64, + "size": currentSize, + "order": 0, + "metadata": { + "originalSize": len(part.data), + "originalPixels": originalPixels, + "compressedSize": currentSize, + "quality": currentQuality, + "strategy": "compressed" + } + }) + return chunks + + currentQuality -= 10 + + # Strategy 4: Tile the image if still too large + if currentSize > maxBytes: + chunks = self._tileImage(image, maxBytes, tileSize, quality, originalPixels) + return chunks + + # Fallback: Return compressed version even if over limit + output = io.BytesIO() + image.save(output, format='JPEG', quality=10, optimize=True) + compressedData = output.getvalue() + compressedB64 = base64.b64encode(compressedData).decode('utf-8') + + chunks.append({ + "data": compressedB64, + "size": len(compressedB64), + "order": 0, + "metadata": { + "originalSize": len(part.data), + "originalPixels": originalPixels, + "compressedSize": len(compressedB64), + "quality": 10, + "strategy": 
"fallback_compressed" + } + }) + + except Exception as e: + # Fallback: Return original data with error metadata + chunks.append({ + "data": part.data, + "size": len(part.data), + "order": 0, + "metadata": { + "originalSize": len(part.data), + "strategy": "error_fallback", + "error": str(e) + } + }) + + return chunks + + def _tileImage(self, image: "Image.Image", maxBytes: int, tileSize: int, quality: int, originalPixels: int) -> List[Dict[str, Any]]: + """Split image into tiles if it's still too large after compression.""" + chunks = [] + width, height = image.size + + # Calculate tile grid + tilesX = (width + tileSize - 1) // tileSize + tilesY = (height + tileSize - 1) // tileSize + + for y in range(tilesY): + for x in range(tilesX): + # Calculate tile boundaries + left = x * tileSize + top = y * tileSize + right = min(left + tileSize, width) + bottom = min(top + tileSize, height) + + # Extract tile + tile = image.crop((left, top, right, bottom)) + + # Compress tile + output = io.BytesIO() + tile.save(output, format='JPEG', quality=quality, optimize=True) + tileData = output.getvalue() + tileB64 = base64.b64encode(tileData).decode('utf-8') + + chunks.append({ + "data": tileB64, + "size": len(tileB64), + "order": y * tilesX + x, + "metadata": { + "originalSize": len(image.tobytes()), + "originalPixels": originalPixels, + "tileSize": tileSize, + "tilePosition": f"{x},{y}", + "tileBounds": f"{left},{top},{right},{bottom}", + "quality": quality, + "strategy": "tiled" + } + }) + + return chunks diff --git a/modules/services/serviceExtraction/chunking/text_chunker.py b/modules/services/serviceExtraction/chunking/text_chunker.py index 35c75168..2c05eeaf 100644 --- a/modules/services/serviceExtraction/chunking/text_chunker.py +++ b/modules/services/serviceExtraction/chunking/text_chunker.py @@ -1,12 +1,17 @@ from typing import Any, Dict, List +import logging from modules.datamodels.datamodelExtraction import ContentPart from ..subRegistry import Chunker +logger = 
logging.getLogger(__name__) + class TextChunker(Chunker): def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]: maxBytes = int(options.get("textChunkSize", 40000)) + logger.debug(f"TextChunker: textChunkSize from options: {options.get('textChunkSize', 'NOT_FOUND')}") + logger.debug(f"TextChunker: using maxBytes: {maxBytes}") chunks: List[Dict[str, Any]] = [] current: List[str] = [] size = 0 diff --git a/modules/services/serviceExtraction/formats/binary_extractor.py b/modules/services/serviceExtraction/formats/binary_extractor.py index 1c201c36..e6667fda 100644 --- a/modules/services/serviceExtraction/formats/binary_extractor.py +++ b/modules/services/serviceExtraction/formats/binary_extractor.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List import base64 -from ..utils import makeId +from ..subUtils import makeId from modules.datamodels.datamodelExtraction import ContentPart from ..subRegistry import Extractor diff --git a/modules/services/serviceExtraction/formats/csv_extractor.py b/modules/services/serviceExtraction/formats/csv_extractor.py index db3cf969..27233979 100644 --- a/modules/services/serviceExtraction/formats/csv_extractor.py +++ b/modules/services/serviceExtraction/formats/csv_extractor.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List from modules.datamodels.datamodelExtraction import ContentPart -from ..utils import makeId +from ..subUtils import makeId from ..subRegistry import Extractor diff --git a/modules/services/serviceExtraction/formats/docx_extractor.py b/modules/services/serviceExtraction/formats/docx_extractor.py index 6cb75716..51384ffd 100644 --- a/modules/services/serviceExtraction/formats/docx_extractor.py +++ b/modules/services/serviceExtraction/formats/docx_extractor.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List import io -from ..utils import makeId +from ..subUtils import makeId from modules.datamodels.datamodelExtraction import ContentPart from ..subRegistry import Extractor diff --git 
a/modules/services/serviceExtraction/formats/html_extractor.py b/modules/services/serviceExtraction/formats/html_extractor.py index 6c49c50c..09da02f4 100644 --- a/modules/services/serviceExtraction/formats/html_extractor.py +++ b/modules/services/serviceExtraction/formats/html_extractor.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List from bs4 import BeautifulSoup from modules.datamodels.datamodelExtraction import ContentPart -from ..utils import makeId +from ..subUtils import makeId from ..subRegistry import Extractor diff --git a/modules/services/serviceExtraction/formats/image_extractor.py b/modules/services/serviceExtraction/formats/image_extractor.py index 296eb50b..22327f50 100644 --- a/modules/services/serviceExtraction/formats/image_extractor.py +++ b/modules/services/serviceExtraction/formats/image_extractor.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List import base64 -from ..utils import makeId +from ..subUtils import makeId from modules.datamodels.datamodelExtraction import ContentPart from ..subRegistry import Extractor diff --git a/modules/services/serviceExtraction/formats/json_extractor.py b/modules/services/serviceExtraction/formats/json_extractor.py index 456eb08e..86eac791 100644 --- a/modules/services/serviceExtraction/formats/json_extractor.py +++ b/modules/services/serviceExtraction/formats/json_extractor.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List import json from modules.datamodels.datamodelExtraction import ContentPart -from ..utils import makeId +from ..subUtils import makeId from ..subRegistry import Extractor diff --git a/modules/services/serviceExtraction/formats/pdf_extractor.py b/modules/services/serviceExtraction/formats/pdf_extractor.py index 4d0d8058..59c88dc7 100644 --- a/modules/services/serviceExtraction/formats/pdf_extractor.py +++ b/modules/services/serviceExtraction/formats/pdf_extractor.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List import base64 import io -from ..utils import makeId +from 
..subUtils import makeId from modules.datamodels.datamodelExtraction import ContentPart from ..subRegistry import Extractor diff --git a/modules/services/serviceExtraction/formats/text_extractor.py b/modules/services/serviceExtraction/formats/text_extractor.py index 5099d04c..a6d92bc1 100644 --- a/modules/services/serviceExtraction/formats/text_extractor.py +++ b/modules/services/serviceExtraction/formats/text_extractor.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List from modules.datamodels.datamodelExtraction import ContentPart -from ..utils import makeId +from ..subUtils import makeId from ..subRegistry import Extractor diff --git a/modules/services/serviceExtraction/formats/xlsx_extractor.py b/modules/services/serviceExtraction/formats/xlsx_extractor.py index 577b0776..ea6396a2 100644 --- a/modules/services/serviceExtraction/formats/xlsx_extractor.py +++ b/modules/services/serviceExtraction/formats/xlsx_extractor.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List import io from datetime import datetime -from ..utils import makeId +from ..subUtils import makeId from modules.datamodels.datamodelExtraction import ContentPart from ..subRegistry import Extractor @@ -75,7 +75,8 @@ class XlsxExtractor(Extractor): elif isinstance(v, datetime): cells.append(v.strftime("%Y-%m-%d %H:%M:%S")) else: - cells.append(f'"{str(v).replace("\"", "\"\"")}"') + escaped_value = str(v).replace('"', '""') + cells.append(f'"{escaped_value}"') lines.append(",".join(cells)) csvData = "\n".join(lines) parts.append(ContentPart( diff --git a/modules/services/serviceExtraction/formats/xml_extractor.py b/modules/services/serviceExtraction/formats/xml_extractor.py index 7067924b..5aabea35 100644 --- a/modules/services/serviceExtraction/formats/xml_extractor.py +++ b/modules/services/serviceExtraction/formats/xml_extractor.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List import xml.etree.ElementTree as ET from modules.datamodels.datamodelExtraction import ContentPart -from ..utils 
import makeId +from ..subUtils import makeId from ..subRegistry import Extractor diff --git a/modules/services/serviceExtraction/mainServiceExtraction.py b/modules/services/serviceExtraction/mainServiceExtraction.py index 8ec970e8..6d313463 100644 --- a/modules/services/serviceExtraction/mainServiceExtraction.py +++ b/modules/services/serviceExtraction/mainServiceExtraction.py @@ -1,9 +1,14 @@ from typing import Any, Dict, List, Optional, Union import uuid +import logging from .subRegistry import ExtractorRegistry, ChunkerRegistry from .subPipeline import runExtraction, poolAndLimit, applyAiIfRequested -from modules.datamodels.datamodelExtraction import ExtractedContent, ContentPart +from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy +from modules.datamodels.datamodelChat import ChatDocument + + +logger = logging.getLogger(__name__) class ExtractionService: @@ -12,45 +17,325 @@ class ExtractionService: self._extractorRegistry = ExtractorRegistry() self._chunkerRegistry = ChunkerRegistry() - def extractContent(self, documentList: List[Dict[str, Any]], options: Dict[str, Any]) -> List[ExtractedContent]: - results: List[ExtractedContent] = [] - for doc in documentList: + def extractContent(self, documents: List[ChatDocument], options: Dict[str, Any]) -> List[ContentExtracted]: + """ + Extract content from a list of ChatDocument objects. + + Args: + documents: List of ChatDocument objects to extract content from + options: Extraction options including maxSize, chunkAllowed, mergeStrategy, etc. 
+ + Returns: + List of ContentExtracted objects, one per input document + """ + results: List[ContentExtracted] = [] + + # Lazy import to avoid circular deps and heavy init at module import + from modules.interfaces.interfaceDbComponentObjects import getInterface + dbInterface = getInterface() + + for i, doc in enumerate(documents): + logger.info(f"=== DOCUMENT {i}: {doc.fileName} ===") + logger.info(f"Initial MIME type: {doc.mimeType}") + + # Resolve raw bytes for this document using interface + documentBytes = dbInterface.getFileData(doc.fileId) + if not documentBytes: + raise ValueError(f"No file data found for fileId={doc.fileId}") + + # Convert ChatDocument to the format expected by runExtraction + documentData = { + "id": doc.id, + "bytes": documentBytes, + "fileName": doc.fileName, + "mimeType": doc.mimeType + } + ec = runExtraction( extractorRegistry=self._extractorRegistry, chunkerRegistry=self._chunkerRegistry, - documentBytes=doc.get("bytes"), - fileName=doc.get("fileName"), - mimeType=doc.get("mimeType"), + documentBytes=documentData["bytes"], + fileName=documentData["fileName"], + mimeType=documentData["mimeType"], options=options ) + + # Log content parts metadata + logger.debug(f"Content parts: {len(ec.parts)}") + for j, part in enumerate(ec.parts): + logger.debug(f" Part {j}: {part.typeGroup} ({part.mimeType}) - {len(part.data) if part.data else 0} chars") + if part.metadata: + logger.debug(f" Metadata: {part.metadata}") + # Attach document id to parts if missing for p in ec.parts: if "documentId" not in p.metadata: - p.metadata["documentId"] = doc.get("id") or str(uuid.uuid4()) + p.metadata["documentId"] = documentData["id"] or str(uuid.uuid4()) + + # Log chunking information + chunked_parts = [p for p in ec.parts if p.metadata.get("chunk", False)] + if chunked_parts: + logger.debug(f"=== CHUNKING RESULTS ===") + logger.debug(f"Total parts: {len(ec.parts)}") + logger.debug(f"Chunked parts: {len(chunked_parts)}") + for chunk in chunked_parts: + 
logger.debug(f" Chunk: {chunk.label} - {len(chunk.data)} chars (parent: {chunk.parentId})") + else: + logger.debug(f"No chunking needed - {len(ec.parts)} parts fit within size limits") + ec = applyAiIfRequested(ec, options) results.append(ec) + return results - async def extractContentFromDocument(self, prompt: str, documents: List[Dict[str, Any]], options: Optional[Dict[str, Any]] = None) -> List[ExtractedContent]: + def mergeAiResults( + self, + extractedContent: List[ContentExtracted], + aiResults: List[str], + strategy: MergeStrategy + ) -> ContentExtracted: """ - Batch extract content from multiple documents. + Merge AI results from chunked content back into a single ContentExtracted. Args: - prompt: Instructional prompt for optional AI post-processing/selection. - documents: List of dicts with keys: id, bytes, fileName, mimeType. - options: Optional extraction options. "ai" config may be provided. + extractedContent: List of ContentExtracted objects that were processed + aiResults: List of AI response strings, one per chunk + strategy: Merge strategy configuration (dict or MergeStrategy object) Returns: - List[ExtractedContent]: one per input document in order. 
+ Single ContentExtracted with merged AI results """ - # Build options safely and inject prompt for downstream AI selection if desired - effectiveOptions: Dict[str, Any] = options.copy() if options else {} - aiCfg = effectiveOptions.get("ai") or {} - if prompt: - aiCfg["prompt"] = prompt - effectiveOptions["ai"] = aiCfg - - # Delegate to existing synchronous pipeline - return self.extractContent(documents, effectiveOptions) + logger.debug(f"=== MERGING AI RESULTS ===") + logger.debug(f"Extracted content: {len(extractedContent)} documents") + logger.debug(f"AI results: {len(aiResults)} responses") + logger.debug(f"Merge strategy: {strategy.mergeType}") + + mergeStrategy = strategy + + # Collect all parts from all extracted content + allParts: List[ContentPart] = [] + for ec in extractedContent: + allParts.extend(ec.parts) + + logger.debug(f"Total original parts: {len(allParts)}") + + # Create AI result parts + aiResultParts: List[ContentPart] = [] + for i, aiResult in enumerate(aiResults): + aiPart = ContentPart( + id=f"ai_result_{i}", + parentId=None, # Will be set based on strategy + label="ai_result", + typeGroup="text", + mimeType="text/plain", + data=aiResult, + metadata={ + "aiResult": True, + "order": i, + "size": len(aiResult.encode('utf-8')) + } + ) + aiResultParts.append(aiPart) + + logger.debug(f"Created {len(aiResultParts)} AI result parts") + + # Apply merging strategy + if mergeStrategy.mergeType == "concatenate": + mergedParts = self._mergeConcatenate(allParts, aiResultParts, mergeStrategy) + elif mergeStrategy.mergeType == "hierarchical": + mergedParts = self._mergeHierarchical(allParts, aiResultParts, mergeStrategy) + elif mergeStrategy.mergeType == "intelligent": + mergedParts = self._mergeIntelligent(allParts, aiResultParts, mergeStrategy) + else: + # Default to concatenate + mergedParts = self._mergeConcatenate(allParts, aiResultParts, mergeStrategy) + + # Create final ContentExtracted + mergedContent = ContentExtracted( + 
id=f"merged_{uuid.uuid4()}", + parts=mergedParts + ) + + logger.debug(f"=== MERGE COMPLETED ===") + logger.debug(f"Final merged parts: {len(mergedParts)}") + logger.debug(f"Merged content ID: {mergedContent.id}") + + return mergedContent + + def _mergeConcatenate( + self, + originalParts: List[ContentPart], + aiResultParts: List[ContentPart], + strategy: MergeStrategy + ) -> List[ContentPart]: + """Merge parts by simple concatenation.""" + mergedParts = [] + + # Add original parts (filtered if needed) + for part in originalParts: + if strategy.preserveChunks or not part.metadata.get("chunk", False): + mergedParts.append(part) + + # Add AI results + if aiResultParts: + # Group AI results by parentId if available + aiResultsByParent = {} + for aiPart in aiResultParts: + parentId = aiPart.parentId or "root" + if parentId not in aiResultsByParent: + aiResultsByParent[parentId] = [] + aiResultsByParent[parentId].append(aiPart) + + # Merge AI results for each parent + for parentId, aiParts in aiResultsByParent.items(): + if len(aiParts) == 1: + mergedParts.append(aiParts[0]) + else: + # Concatenate multiple AI results for same parent + combinedData = strategy.chunkSeparator.join([p.data for p in aiParts]) + combinedPart = ContentPart( + id=f"merged_ai_{parentId}", + parentId=parentId if parentId != "root" else None, + label="merged_ai_result", + typeGroup="text", + mimeType="text/plain", + data=combinedData, + metadata={ + "aiResult": True, + "merged": True, + "sourceCount": len(aiParts), + "size": len(combinedData.encode('utf-8')) + } + ) + mergedParts.append(combinedPart) + + return mergedParts + + def _mergeHierarchical( + self, + originalParts: List[ContentPart], + aiResultParts: List[ContentPart], + strategy: MergeStrategy + ) -> List[ContentPart]: + """Merge parts hierarchically based on parentId relationships.""" + # Group parts by parentId + partsByParent = {} + for part in originalParts: + parentId = part.parentId or "root" + if parentId not in partsByParent: + 
partsByParent[parentId] = [] + partsByParent[parentId].append(part) + + # Group AI results by parentId + aiResultsByParent = {} + for aiPart in aiResultParts: + parentId = aiPart.parentId or "root" + if parentId not in aiResultsByParent: + aiResultsByParent[parentId] = [] + aiResultsByParent[parentId].append(aiPart) + + mergedParts = [] + + # Process each parent group + for parentId in set(list(partsByParent.keys()) + list(aiResultsByParent.keys())): + originalGroup = partsByParent.get(parentId, []) + aiGroup = aiResultsByParent.get(parentId, []) + + # Add original parts + mergedParts.extend(originalGroup) + + # Add AI results for this parent + if aiGroup: + if len(aiGroup) == 1: + mergedParts.append(aiGroup[0]) + else: + # Merge multiple AI results + combinedData = strategy.chunkSeparator.join([p.data for p in aiGroup]) + combinedPart = ContentPart( + id=f"hierarchical_ai_{parentId}", + parentId=parentId if parentId != "root" else None, + label="hierarchical_ai_result", + typeGroup="text", + mimeType="text/plain", + data=combinedData, + metadata={ + "aiResult": True, + "hierarchical": True, + "sourceCount": len(aiGroup), + "size": len(combinedData.encode('utf-8')) + } + ) + mergedParts.append(combinedPart) + + return mergedParts + + def _mergeIntelligent( + self, + originalParts: List[ContentPart], + aiResultParts: List[ContentPart], + strategy: MergeStrategy + ) -> List[ContentPart]: + """Merge parts using intelligent strategies based on content type.""" + mergedParts = [] + + # Group by typeGroup for intelligent merging + partsByType = {} + for part in originalParts: + typeGroup = part.typeGroup + if typeGroup not in partsByType: + partsByType[typeGroup] = [] + partsByType[typeGroup].append(part) + + # Process each type group + for typeGroup, parts in partsByType.items(): + if typeGroup == "text": + mergedParts.extend(self._mergeTextIntelligent(parts, aiResultParts, strategy)) + elif typeGroup == "table": + mergedParts.extend(self._mergeTableIntelligent(parts, 
aiResultParts, strategy)) + elif typeGroup == "structure": + mergedParts.extend(self._mergeStructureIntelligent(parts, aiResultParts, strategy)) + else: + # Default handling for other types + mergedParts.extend(parts) + + # Add any remaining AI results that weren't merged + for aiPart in aiResultParts: + if not any(p.id == aiPart.id for p in mergedParts): + mergedParts.append(aiPart) + + return mergedParts + + def _mergeTextIntelligent( + self, + textParts: List[ContentPart], + aiResultParts: List[ContentPart], + strategy: MergeStrategy + ) -> List[ContentPart]: + """Intelligent merging for text content.""" + # For now, use concatenate strategy + # This could be enhanced with semantic analysis, summarization, etc. + return self._mergeConcatenate(textParts, aiResultParts, strategy) + + def _mergeTableIntelligent( + self, + tableParts: List[ContentPart], + aiResultParts: List[ContentPart], + strategy: MergeStrategy + ) -> List[ContentPart]: + """Intelligent merging for table content.""" + # For now, use concatenate strategy + # This could be enhanced with table merging logic + return self._mergeConcatenate(tableParts, aiResultParts, strategy) + + def _mergeStructureIntelligent( + self, + structureParts: List[ContentPart], + aiResultParts: List[ContentPart], + strategy: MergeStrategy + ) -> List[ContentPart]: + """Intelligent merging for structured content.""" + # For now, use concatenate strategy + # This could be enhanced with structure-aware merging + return self._mergeConcatenate(structureParts, aiResultParts, strategy) diff --git a/modules/services/serviceExtraction/merging/table_merger.py b/modules/services/serviceExtraction/merging/table_merger.py index 04be404e..4f62358c 100644 --- a/modules/services/serviceExtraction/merging/table_merger.py +++ b/modules/services/serviceExtraction/merging/table_merger.py @@ -1,6 +1,6 @@ from typing import Any, Dict, List from modules.datamodels.datamodelExtraction import ContentPart -from ..utils import makeId +from 
..subUtils import makeId class TableMerger: diff --git a/modules/services/serviceExtraction/merging/text_merger.py b/modules/services/serviceExtraction/merging/text_merger.py index bb9e850d..38f7c6f0 100644 --- a/modules/services/serviceExtraction/merging/text_merger.py +++ b/modules/services/serviceExtraction/merging/text_merger.py @@ -1,6 +1,6 @@ from typing import Any, Dict, List from modules.datamodels.datamodelExtraction import ContentPart -from ..utils import makeId +from ..subUtils import makeId class TextMerger: diff --git a/modules/services/serviceExtraction/subPipeline.py b/modules/services/serviceExtraction/subPipeline.py index 4441e9c4..fd7eb20c 100644 --- a/modules/services/serviceExtraction/subPipeline.py +++ b/modules/services/serviceExtraction/subPipeline.py @@ -1,14 +1,61 @@ from typing import Any, Dict, List +import logging +import os -from modules.datamodels.datamodelExtraction import ExtractedContent, ContentPart -from .utils import makeId +from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart +from .subUtils import makeId from .subRegistry import ExtractorRegistry, ChunkerRegistry from .merging.text_merger import TextMerger from .merging.table_merger import TableMerger from .merging.default_merger import DefaultMerger +logger = logging.getLogger(__name__) -def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: Dict[str, Any]) -> ExtractedContent: + +def _mergeParts(parts: List[ContentPart], mergeStrategy: Dict[str, Any]) -> List[ContentPart]: + """Merge parts based on the provided strategy.""" + if not parts or not mergeStrategy: + return parts + + groupBy = mergeStrategy.get("groupBy", "typeGroup") + orderBy = mergeStrategy.get("orderBy", "id") + + # Group parts by the specified field + groups = {} + for part in parts: + key = getattr(part, groupBy, "unknown") + if key not in groups: + groups[key] = [] + 
groups[key].append(part) + + # Merge each group + merged_parts = [] + for group_key, group_parts in groups.items(): + if len(group_parts) == 1: + merged_parts.extend(group_parts) + else: + # Sort by orderBy field if specified + if orderBy: + group_parts.sort(key=lambda p: getattr(p, orderBy, "")) + + # Use appropriate merger based on type + type_group = group_parts[0].typeGroup if group_parts else "unknown" + + if type_group == "text": + merger = TextMerger() + elif type_group == "table": + merger = TableMerger() + else: + merger = DefaultMerger() + + # Merge the group + merged = merger.merge(group_parts, mergeStrategy) + merged_parts.extend(merged) + + return merged_parts + + +def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: Dict[str, Any]) -> ContentExtracted: extractor = extractorRegistry.resolve(mimeType, fileName) if extractor is None: # fallback: single binary part @@ -21,14 +68,66 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker data="", metadata={"warning": "No extractor registered"} ) - return ExtractedContent(id=makeId(), parts=[part]) + return ContentExtracted(id=makeId(), parts=[part]) parts = extractor.extract(documentBytes, {"fileName": fileName, "mimeType": mimeType, "options": options}) - # Optional merge step + + # Apply chunking and size limiting + parts = poolAndLimit(parts, chunkerRegistry, options) + + # Optional merge step - but preserve chunks mergeStrategy = options.get("mergeStrategy", {}) if mergeStrategy: - parts = _mergeParts(parts, mergeStrategy) - return ExtractedContent(id=makeId(), parts=parts) + + # Don't merge chunks - they should stay separate for processing + non_chunk_parts = [p for p in parts if not p.metadata.get("chunk", False)] + chunk_parts = [p for p in parts if p.metadata.get("chunk", False)] + + logger.debug(f"runExtraction: Preserving {len(chunk_parts)} chunks from merging") + + if 
non_chunk_parts: + non_chunk_parts = _mergeParts(non_chunk_parts, mergeStrategy) + + # Combine non-chunk parts with chunk parts (chunks stay separate) + parts = non_chunk_parts + chunk_parts + + logger.debug(f"runExtraction: Final parts after merging: {len(parts)} (chunks: {len(chunk_parts)})") + # DEBUG: dump parts and chunks to files TODO TO REMOVE + try: + base_dir = "./test-chat/ai" + os.makedirs(base_dir, exist_ok=True) + + # Generate timestamp for consistent naming + from datetime import datetime, UTC + ts = datetime.now(UTC).strftime('%Y%m%d-%H%M%S-%f')[:-3] + + # Write a summary file + summary_lines: List[str] = [f"fileName: {fileName}", f"mimeType: {mimeType}", f"totalParts: {len(parts)}"] + text_index = 0 + for idx, part in enumerate(parts): + is_texty = part.typeGroup in ("text", "table", "structure") + size = int(part.metadata.get("size", 0) or 0) + is_chunk = bool(part.metadata.get("chunk", False)) + summary_lines.append( + f"part[{idx}]: typeGroup={part.typeGroup}, label={part.label}, size={size}, chunk={is_chunk}" + ) + if is_texty and getattr(part, "data", None): + text_index += 1 + fname = f"{ts}_extract_{fileName}_part_{idx:03d}_{'chunk' if is_chunk else 'full'}_{text_index:03d}.txt" + fpath = os.path.join(base_dir, fname) + with open(fpath, "w", encoding="utf-8") as f: + f.write(f"# typeGroup: {part.typeGroup}\n# label: {part.label}\n# chunk: {is_chunk}\n# size: {size}\n\n") + f.write(str(part.data)) + + # Write summary file + summary_fname = f"{ts}_extract_{fileName}_summary.txt" + summary_fpath = os.path.join(base_dir, summary_fname) + with open(summary_fpath, "w", encoding="utf-8") as f: + f.write("\n".join(summary_lines)) + except Exception as _e: + logger.debug(f"Debug dump skipped: {_e}") + + return ContentExtracted(id=makeId(), parts=parts) def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, options: Dict[str, Any]) -> List[ContentPart]: @@ -57,28 +156,54 @@ def poolAndLimit(parts: List[ContentPart], 
chunkerRegistry: ChunkerRegistry, opt # If we have remaining parts and chunking is allowed, try chunking if remaining and chunkAllowed: + logger.debug(f"=== CHUNKING ACTIVATED ===") + logger.debug(f"Remaining parts to chunk: {len(remaining)}") + logger.debug(f"Max size limit: {maxSize} bytes") + logger.debug(f"Current size used: {current} bytes") + for p in remaining: - if p.typeGroup in ("text", "table", "structure"): + if p.typeGroup in ("text", "table", "structure", "image"): + logger.debug(f"Chunking {p.typeGroup} part: {len(p.data)} chars") chunks = chunkerRegistry.resolve(p.typeGroup).chunk(p, options) + logger.debug(f"Created {len(chunks)} chunks") + + chunks_added = 0 for ch in chunks: chSize = int(ch.get("size", 0) or 0) - if current + chSize <= maxSize: - kept.append(ContentPart( - id=makeId(), - parentId=p.id, - label=f"chunk_{ch.get('order', 0)}", - typeGroup=p.typeGroup, - mimeType=p.mimeType, - data=ch.get("data", ""), - metadata={"size": chSize, "chunk": True} - )) - current += chSize - else: - break + # Add all chunks - don't limit by maxSize since they'll be processed separately + kept.append(ContentPart( + id=makeId(), + parentId=p.id, + label=f"chunk_{ch.get('order', 0)}", + typeGroup=p.typeGroup, + mimeType=p.mimeType, + data=ch.get("data", ""), + metadata={ + "size": chSize, + "chunk": True, + **ch.get("metadata", {}) + } + )) + chunks_added += 1 + logger.debug(f"Added chunk {ch.get('order', 0)}: {chSize} bytes") + + logger.debug(f"Added {chunks_added} chunks from {p.typeGroup} part") - # Apply merging strategy if provided + # Apply merging strategy if provided, but preserve chunks if mergeStrategy: - kept = _applyMerging(kept, mergeStrategy) + # Don't merge chunks - they should stay separate for processing + non_chunk_parts = [p for p in kept if not p.metadata.get("chunk", False)] + chunk_parts = [p for p in kept if p.metadata.get("chunk", False)] + + logger.debug(f"Preserving {len(chunk_parts)} chunks from merging") + + if non_chunk_parts: + 
non_chunk_parts = _applyMerging(non_chunk_parts, mergeStrategy) + + # Combine non-chunk parts with chunk parts (chunks stay separate) + kept = non_chunk_parts + chunk_parts + + logger.debug(f"Final parts after merging: {len(kept)} (chunks: {len(chunk_parts)})") # Re-check size after merging totalSize = sum(int(p.metadata.get("size", 0) or 0) for p in kept) @@ -151,7 +276,7 @@ def _applySizeLimit(parts: List[ContentPart], maxSize: int) -> List[ContentPart] return kept -def applyAiIfRequested(extracted: ExtractedContent, options: Dict[str, Any]) -> ExtractedContent: +def applyAiIfRequested(extracted: ContentExtracted, options: Dict[str, Any]) -> ContentExtracted: """ Apply AI processing if requested in options. This is a placeholder for actual AI integration. diff --git a/modules/services/serviceExtraction/subRegistry.py b/modules/services/serviceExtraction/subRegistry.py index 9c0adab5..07a978d4 100644 --- a/modules/services/serviceExtraction/subRegistry.py +++ b/modules/services/serviceExtraction/subRegistry.py @@ -59,8 +59,11 @@ class ExtractorRegistry: self.register("xlsm", XlsxExtractor()) # fallback self.setFallback(BinaryExtractor()) - except Exception: - pass + print(f"✅ ExtractorRegistry: Successfully registered {len(self._map)} extractors") + except Exception as e: + print(f"❌ ExtractorRegistry: Failed to register extractors: {str(e)}") + import traceback + traceback.print_exc() def register(self, key: str, extractor: Extractor): self._map[key] = extractor @@ -88,11 +91,16 @@ class ChunkerRegistry: from .chunking.text_chunker import TextChunker from .chunking.table_chunker import TableChunker from .chunking.structure_chunker import StructureChunker + # Skip ImageChunker for now to avoid PIL import hang + # from .chunking.image_chunker import ImageChunker self.register("text", TextChunker()) self.register("table", TableChunker()) self.register("structure", StructureChunker()) - except Exception: - pass + # self.register("image", ImageChunker()) + except 
Exception as e: + print(f"❌ ChunkerRegistry: Failed to register chunkers: {str(e)}") + import traceback + traceback.print_exc() def register(self, typeGroup: str, chunker: Chunker): self._map[typeGroup] = chunker diff --git a/modules/services/serviceExtraction/utils/__init__.py b/modules/services/serviceExtraction/subUtils.py similarity index 96% rename from modules/services/serviceExtraction/utils/__init__.py rename to modules/services/serviceExtraction/subUtils.py index a16d3f59..efee532b 100644 --- a/modules/services/serviceExtraction/utils/__init__.py +++ b/modules/services/serviceExtraction/subUtils.py @@ -3,5 +3,3 @@ import uuid def makeId() -> str: return str(uuid.uuid4()) - - diff --git a/modules/services/serviceGeneration/mainServiceGeneration.py b/modules/services/serviceGeneration/mainServiceGeneration.py index f18f071b..72301768 100644 --- a/modules/services/serviceGeneration/mainServiceGeneration.py +++ b/modules/services/serviceGeneration/mainServiceGeneration.py @@ -105,12 +105,49 @@ class GenerationService: logger.info(f"Document {document_name} has content: {len(content)} characters") + # Normalize file extension based on mime type if missing or incorrect + try: + mime_to_ext = { + "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx", + "application/pdf": ".pdf", + "text/html": ".html", + "text/markdown": ".md", + "text/plain": ".txt", + "application/json": ".json", + } + expected_ext = mime_to_ext.get(mime_type) + if expected_ext: + if not document_name.lower().endswith(expected_ext): + # Append/replace extension to match mime type + if "." 
in document_name: + document_name = document_name.rsplit(".", 1)[0] + expected_ext + else: + document_name = document_name + expected_ext + except Exception: + pass + + # Decide if content is base64-encoded binary (e.g., docx/pdf) or plain text + base64encoded = False + try: + binary_mime_types = { + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/pdf", + } + if isinstance(document_data, str) and mime_type in binary_mime_types: + base64encoded = True + except Exception: + base64encoded = False + # Create document with file in one step using interfaces directly document = self._createDocument( fileName=document_name, mimeType=mime_type, content=content, - base64encoded=False, + base64encoded=base64encoded, messageId=message_id ) if document: @@ -257,4 +294,109 @@ class GenerationService: 'totalActions': 0, 'workflowStatus': 'unknown', 'workflowId': 'unknown' - } \ No newline at end of file + } + + async def renderReport(self, extracted_content: str, output_format: str, title: str) -> tuple[str, str]: + """ + Render extracted content to the specified output format. 
+ + Args: + extracted_content: Content extracted by AI using format-specific prompt + output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx) + title: Report title + + Returns: + tuple: (rendered_content, mime_type) + """ + try: + # DEBUG: dump renderer input to diagnose JSON+HTML mixtures TODO REMOVE + try: + import os + ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") + debug_root = "./test-chat/ai" + debug_dir = os.path.join(debug_root, f"render_input_{ts}") + os.makedirs(debug_dir, exist_ok=True) + with open(os.path.join(debug_dir, "meta.txt"), "w", encoding="utf-8") as f: + f.write(f"title: {title}\nformat: {output_format}\nlength: {len(extracted_content or '')}\nstarts_with_brace: {str(extracted_content.strip().startswith('{') if extracted_content else False)}\n") + with open(os.path.join(debug_dir, "extracted_content.txt"), "w", encoding="utf-8") as f: + f.write(extracted_content or "") + except Exception: + pass + + # Get the appropriate renderer for the format + renderer = self._getFormatRenderer(output_format) + if not renderer: + raise ValueError(f"Unsupported output format: {output_format}") + + # Render the content + rendered_content, mime_type = await renderer.render(extracted_content, title) + # DEBUG: dump rendered output + try: + import os + with open(os.path.join(debug_dir, "rendered_output.txt"), "w", encoding="utf-8") as f: + f.write(rendered_content or "") + except Exception: + pass + + logger.info(f"Successfully rendered report to {output_format} format: {len(rendered_content)} characters") + return rendered_content, mime_type + + except Exception as e: + logger.error(f"Error rendering report to {output_format}: {str(e)}") + raise + + def getExtractionPrompt(self, output_format: str, user_prompt: str, title: str) -> str: + """ + Get the format-specific extraction prompt for AI content extraction. 
+ + Args: + output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx) + user_prompt: User's original prompt for report generation + title: Report title + + Returns: + str: Format-specific prompt for AI extraction + """ + try: + # Get the appropriate renderer for the format + renderer = self._getFormatRenderer(output_format) + if not renderer: + raise ValueError(f"Unsupported output format: {output_format}") + + # Build centralized prompt with generic rules + format-specific guidelines + from .prompt_builder import buildExtractionPrompt + extraction_prompt = buildExtractionPrompt( + output_format=output_format, + renderer=renderer, + user_prompt=user_prompt, + title=title + ) + + logger.info(f"Generated {output_format}-specific extraction prompt: {len(extraction_prompt)} characters") + return extraction_prompt + + except Exception as e: + logger.error(f"Error getting extraction prompt for {output_format}: {str(e)}") + raise + + def _getFormatRenderer(self, output_format: str): + """Get the appropriate renderer for the specified format using auto-discovery.""" + try: + from .renderers.registry import get_renderer + renderer = get_renderer(output_format) + + if renderer: + return renderer + + # Fallback to text renderer if no specific renderer found + logger.warning(f"No renderer found for format {output_format}, falling back to text") + fallback_renderer = get_renderer('text') + if fallback_renderer: + return fallback_renderer + + logger.error("Even text renderer fallback failed") + return None + + except Exception as e: + logger.error(f"Error getting renderer for {output_format}: {str(e)}") + return None \ No newline at end of file diff --git a/modules/services/serviceGeneration/prompt_builder.py b/modules/services/serviceGeneration/prompt_builder.py new file mode 100644 index 00000000..208c4c18 --- /dev/null +++ b/modules/services/serviceGeneration/prompt_builder.py @@ -0,0 +1,72 @@ +""" +Centralized prompt builder for document generation across 
formats. + +Builds a robust prompt that: +- Accepts any user intent (no fixed structure assumptions) +- Injects format-specific guidelines from the selected renderer +- Adds a common policy section to always use real data from source docs +- Requires the AI to output a filename header that we can parse and use +""" + +from typing import Protocol + + +class _RendererLike(Protocol): + def getExtractionPrompt(self, user_prompt: str, title: str) -> str: # returns only format-specific guidelines + ... + + +def buildExtractionPrompt( + output_format: str, + renderer: _RendererLike, + user_prompt: str, + title: str +) -> str: + """ + Build the final extraction prompt by combining: + - The raw user prompt (verbatim) + - Generic cross-format instructions (filename header + real-data policy) + - Format-specific guidelines snippet provided by the renderer + + The AI must place a single filename header at the very top: + FILENAME: + followed by a blank line and then ONLY the document content according to the target format. + """ + + format_guidelines = renderer.getExtractionPrompt(user_prompt, title) + + # Generic block appears once for every format + generic_intro = f""" +{user_prompt} + +You are generating a document in {output_format.upper()} format for the title: "{title}". + +Rules: +- The user's intent fully defines the structure. Do not assume a fixed template or headings. +- Use only factual information extracted from the supplied source documents. +- Do not invent, hallucinate, or include placeholders (e.g., "lorem ipsum", "TBD"). +- The output must strictly follow the target format and be ready for saving without extra wrapping. +- At the VERY TOP output exactly one line with the filename header: + FILENAME: + - The base name should be short, descriptive, and kebab-case or snake-case without spaces. + - Include the correct extension for the requested format (e.g., .html, .pdf, .docx, .md, .txt, .json, .csv, .xlsx). + - Avoid special characters beyond [a-zA-Z0-9-_]. 
+ - After this header, insert a single blank line and then provide ONLY the document content. + +Common policy: +- Use the actual data from the source documents to create the content. +- Do not generate placeholder text or templates. +- Extract and use the real data provided in the source documents to create meaningful content. +""".strip() + + # Final assembly + final_prompt = ( + generic_intro + + "\n\nFORMAT-SPECIFIC GUIDELINES:\n" + + format_guidelines.strip() + + "\n\nGenerate the complete document content now based on the source documents below:" + ) + + return final_prompt + + diff --git a/modules/services/serviceGeneration/renderers/base_renderer.py b/modules/services/serviceGeneration/renderers/base_renderer.py new file mode 100644 index 00000000..dd91be09 --- /dev/null +++ b/modules/services/serviceGeneration/renderers/base_renderer.py @@ -0,0 +1,86 @@ +""" +Base renderer class for all format renderers. +""" + +from abc import ABC, abstractmethod +from typing import Dict, Any, Tuple, List +import logging + +logger = logging.getLogger(__name__) + +class BaseRenderer(ABC): + """Base class for all format renderers.""" + + def __init__(self): + self.logger = logger + + @classmethod + def get_supported_formats(cls) -> List[str]: + """ + Return list of supported format names for this renderer. + Override this method in subclasses to specify supported formats. + """ + return [] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """ + Return list of format aliases for this renderer. + Override this method in subclasses to specify format aliases. + """ + return [] + + @classmethod + def get_priority(cls) -> int: + """ + Return priority for this renderer (higher number = higher priority). + Used when multiple renderers support the same format. + """ + return 0 + + @abstractmethod + def getExtractionPrompt(self, user_prompt: str, title: str) -> str: + """ + Get the format-specific extraction prompt for AI content extraction. 
+ + Args: + user_prompt: User's original prompt for report generation + title: Report title + + Returns: + str: Format-specific prompt for AI extraction + """ + pass + + @abstractmethod + async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: + """ + Render extracted content to the target format. + + Args: + extracted_content: Raw content extracted by AI using format-specific prompt + title: Report title + + Returns: + tuple: (rendered_content, mime_type) + """ + pass + + def _extract_sections(self, report_data: Dict[str, Any]) -> list: + """Extract sections from report data.""" + return report_data.get('sections', []) + + def _extract_metadata(self, report_data: Dict[str, Any]) -> Dict[str, Any]: + """Extract metadata from report data.""" + return report_data.get('metadata', {}) + + def _get_title(self, report_data: Dict[str, Any], fallback_title: str) -> str: + """Get title from report data or use fallback.""" + return report_data.get('title', fallback_title) + + def _format_timestamp(self, timestamp: str = None) -> str: + """Format timestamp for display.""" + if timestamp: + return timestamp + from datetime import datetime, UTC + return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC") diff --git a/modules/services/serviceGeneration/renderers/csv_renderer.py b/modules/services/serviceGeneration/renderers/csv_renderer.py new file mode 100644 index 00000000..9ef6882c --- /dev/null +++ b/modules/services/serviceGeneration/renderers/csv_renderer.py @@ -0,0 +1,64 @@ +""" +CSV renderer for report generation. 
+""" + +from .base_renderer import BaseRenderer +from typing import Dict, Any, Tuple, List +import csv +import io + +class CsvRenderer(BaseRenderer): + """Renders content to CSV format with format-specific extraction.""" + + @classmethod + def get_supported_formats(cls) -> List[str]: + """Return supported CSV formats.""" + return ['csv'] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """Return format aliases.""" + return ['spreadsheet', 'table'] + + @classmethod + def get_priority(cls) -> int: + """Return priority for CSV renderer.""" + return 70 + + def getExtractionPrompt(self, user_prompt: str, title: str) -> str: + """Return only CSV-specific guidelines; global prompt is built centrally.""" + return ( + "CSV FORMAT GUIDELINES:\n" + "- Emit ONLY CSV text without fences or commentary.\n" + "- Include a single header row with clear column names.\n" + "- Quote fields containing commas, quotes, or newlines; escape quotes by doubling them.\n" + "- Use rows to represent items/records derived from sources.\n" + "- Keep cells concise; include units in headers when useful.\n" + "OUTPUT: Return ONLY valid CSV content that can be imported." 
+ ) + + async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: + """Render extracted content to CSV format.""" + try: + # The extracted content should already be CSV from the AI + # Just clean it up + csv_content = self._clean_csv_content(extracted_content, title) + + return csv_content, "text/csv" + + except Exception as e: + self.logger.error(f"Error rendering CSV: {str(e)}") + # Return minimal CSV fallback + return f"Title,Content\n{title},Error rendering report: {str(e)}", "text/csv" + + def _clean_csv_content(self, content: str, title: str) -> str: + """Clean and validate CSV content from AI.""" + content = content.strip() + + # Remove markdown code blocks if present + if content.startswith("```") and content.endswith("```"): + lines = content.split('\n') + if len(lines) > 2: + content = '\n'.join(lines[1:-1]).strip() + + return content diff --git a/modules/services/serviceGeneration/renderers/docx_renderer.py b/modules/services/serviceGeneration/renderers/docx_renderer.py new file mode 100644 index 00000000..134f00cd --- /dev/null +++ b/modules/services/serviceGeneration/renderers/docx_renderer.py @@ -0,0 +1,249 @@ +""" +DOCX renderer for report generation using python-docx. 
+""" + +from .base_renderer import BaseRenderer +from typing import Dict, Any, Tuple, List +import io +import base64 +from datetime import datetime, UTC + +try: + from docx import Document + from docx.shared import Inches, Pt + from docx.enum.text import WD_ALIGN_PARAGRAPH + from docx.enum.table import WD_TABLE_ALIGNMENT + from docx.oxml.shared import OxmlElement, qn + from docx.oxml.ns import nsdecls + from docx.oxml import parse_xml + DOCX_AVAILABLE = True +except ImportError: + DOCX_AVAILABLE = False + +class DocxRenderer(BaseRenderer): + """Renders content to DOCX format using python-docx.""" + + @classmethod + def get_supported_formats(cls) -> List[str]: + """Return supported DOCX formats.""" + return ['docx', 'doc'] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """Return format aliases.""" + return ['word', 'document'] + + @classmethod + def get_priority(cls) -> int: + """Return priority for DOCX renderer.""" + return 115 + + def getExtractionPrompt(self, user_prompt: str, title: str) -> str: + """Return only DOCX-specific guidelines; global prompt is built centrally.""" + return ( + "DOCX FORMAT GUIDELINES:\n" + "- Provide plain text content suitable for Word generation (no markdown/HTML).\n" + "- Use clear section hierarchy; bullet and numbered lists where needed.\n" + "- Include tables as simple pipe-delimited lines if tabular data is needed.\n" + "OUTPUT: Return ONLY the structured plain text to be converted into DOCX." 
+ ) + + async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: + """Render extracted content to DOCX format.""" + try: + if not DOCX_AVAILABLE: + # Fallback to HTML if python-docx not available + from .html_renderer import HtmlRenderer + html_renderer = HtmlRenderer() + html_content, _ = await html_renderer.render(extracted_content, title) + return html_content, "text/html" + + # Generate DOCX using python-docx + docx_content = self._generate_docx(extracted_content, title) + + return docx_content, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + + except Exception as e: + self.logger.error(f"Error rendering DOCX: {str(e)}") + # Return minimal fallback + return f"DOCX Generation Error: {str(e)}", "text/plain" + + def _generate_docx(self, content: str, title: str) -> str: + """Generate DOCX content using python-docx.""" + try: + # Create new document + doc = Document() + + # Set up document styles + self._setup_document_styles(doc) + + # Add title + title_para = doc.add_heading(title, 0) + title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER + + # Add generation date + date_para = doc.add_paragraph(f"Generated: {self._format_timestamp()}") + date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER + + # Add page break + doc.add_page_break() + + # Process content + lines = content.split('\n') + current_section = [] + + for line in lines: + line = line.strip() + if not line: + continue + + # Check for ALL CAPS headings (major headings) + if line.isupper() and len(line) > 3 and not line.startswith('-') and not line.startswith('*'): + if current_section: + self._process_section(doc, current_section) + current_section = [] + doc.add_heading(line, level=1) + # Check for Title Case headings (subheadings) + elif line.istitle() and len(line) > 5 and not line.startswith('-') and not line.startswith('*') and not line.startswith(('1.', '2.', '3.', '4.', '5.')): + if current_section: + self._process_section(doc, current_section) + 
current_section = [] + doc.add_heading(line, level=2) + # Check for markdown headings (fallback) + elif line.startswith('# '): + # H1 heading + if current_section: + self._process_section(doc, current_section) + current_section = [] + doc.add_heading(line[2:], level=1) + elif line.startswith('## '): + # H2 heading + if current_section: + self._process_section(doc, current_section) + current_section = [] + doc.add_heading(line[3:], level=2) + elif line.startswith('### '): + # H3 heading + if current_section: + self._process_section(doc, current_section) + current_section = [] + doc.add_heading(line[4:], level=3) + else: + current_section.append(line) + + # Process remaining content + if current_section: + self._process_section(doc, current_section) + + # Save to buffer + buffer = io.BytesIO() + doc.save(buffer) + buffer.seek(0) + + # Convert to base64 + docx_bytes = buffer.getvalue() + docx_base64 = base64.b64encode(docx_bytes).decode('utf-8') + + return docx_base64 + + except Exception as e: + self.logger.error(f"Error generating DOCX: {str(e)}") + raise + + def _setup_document_styles(self, doc): + """Set up document styles.""" + try: + # Set default font + style = doc.styles['Normal'] + font = style.font + font.name = 'Calibri' + font.size = Pt(11) + + # Set heading styles + for i in range(1, 4): + heading_style = doc.styles[f'Heading {i}'] + heading_font = heading_style.font + heading_font.name = 'Calibri' + heading_font.size = Pt(16 - i * 2) + heading_font.bold = True + except Exception as e: + self.logger.warning(f"Could not set up document styles: {str(e)}") + + def _process_section(self, doc, lines: list): + """Process a section of content into DOCX elements.""" + for line in lines: + if not line.strip(): + continue + + # Check for tables (lines with |) + if '|' in line and not line.startswith('|'): + # This might be part of a table, process as table + table_data = self._extract_table_data(lines) + if table_data: + self._add_table(doc, table_data) + return + 
+ # Check for lists + if line.startswith('- ') or line.startswith('* '): + # This is a list item + doc.add_paragraph(line[2:], style='List Bullet') + elif line.startswith(('1. ', '2. ', '3. ', '4. ', '5. ')): + # This is a numbered list item + doc.add_paragraph(line[3:], style='List Number') + else: + # Regular paragraph + doc.add_paragraph(line) + + def _extract_table_data(self, lines: list) -> list: + """Extract table data from lines.""" + table_data = [] + in_table = False + + for line in lines: + if '|' in line: + if not in_table: + in_table = True + # Split by | and clean up + cells = [cell.strip() for cell in line.split('|') if cell.strip()] + if cells: + table_data.append(cells) + elif in_table and not line.strip(): + # Empty line, might be end of table + break + + return table_data if len(table_data) > 1 else [] + + def _add_table(self, doc, table_data: list): + """Add a table to the document.""" + try: + if not table_data: + return + + # Create table + table = doc.add_table(rows=len(table_data), cols=len(table_data[0])) + table.alignment = WD_TABLE_ALIGNMENT.CENTER + + # Add data to table + for row_idx, row_data in enumerate(table_data): + for col_idx, cell_data in enumerate(row_data): + if col_idx < len(table.rows[row_idx].cells): + table.rows[row_idx].cells[col_idx].text = cell_data + + # Style the table + self._style_table(table) + + except Exception as e: + self.logger.warning(f"Could not add table: {str(e)}") + + def _style_table(self, table): + """Apply styling to the table.""" + try: + # Style header row + if len(table.rows) > 0: + header_cells = table.rows[0].cells + for cell in header_cells: + for paragraph in cell.paragraphs: + for run in paragraph.runs: + run.bold = True + except Exception as e: + self.logger.warning(f"Could not style table: {str(e)}") \ No newline at end of file diff --git a/modules/services/serviceGeneration/renderers/excel_renderer.py b/modules/services/serviceGeneration/renderers/excel_renderer.py new file mode 100644 index 
"""
Excel renderer for report generation using openpyxl.
"""

from .base_renderer import BaseRenderer
from typing import Dict, Any, Tuple, List
import io
import base64
from datetime import datetime, UTC

# openpyxl is optional; when missing render() degrades to CSV output.
try:
    from openpyxl import Workbook
    from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
    from openpyxl.utils import get_column_letter
    from openpyxl.worksheet.table import Table, TableStyleInfo
    OPENPYXL_AVAILABLE = True
except ImportError:
    OPENPYXL_AVAILABLE = False

class ExcelRenderer(BaseRenderer):
    """Renders content to Excel format using openpyxl.

    NOTE(review): on success render() returns the .xlsx file as a
    base64-encoded *string*, not raw bytes — callers must decode.
    """

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported Excel formats."""
        return ['xlsx', 'xls', 'excel']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases.

        NOTE(review): 'spreadsheet' is also an alias of CsvRenderer in this
        package; resolution depends on the registry — verify.
        """
        return ['spreadsheet', 'workbook']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for Excel renderer."""
        return 110

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only Excel-specific guidelines; global prompt is built centrally."""
        return (
            "EXCEL FORMAT GUIDELINES:\n"
            "- Output one or more pipe-delimited tables with a single header row.\n"
            "- Let user intent define columns; use clear names and ISO dates.\n"
            "- Separate multiple tables by a single blank line.\n"
            "- No markdown/HTML/code fences; tables only unless user explicitly asks for notes.\n"
            "OUTPUT: Return ONLY pipe-delimited tables suitable for import."
        )

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Render extracted content to Excel format.

        Returns (base64_xlsx, xlsx mime type); falls back to CSV when
        openpyxl is unavailable or generation fails (never raises).
        """
        try:
            if not OPENPYXL_AVAILABLE:
                # Fallback to CSV if openpyxl not available
                from .csv_renderer import CsvRenderer
                csv_renderer = CsvRenderer()
                csv_content, _ = await csv_renderer.render(extracted_content, title)
                return csv_content, "text/csv"

            # Generate Excel using openpyxl
            excel_content = self._generate_excel(extracted_content, title)

            return excel_content, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

        except Exception as e:
            self.logger.error(f"Error rendering Excel: {str(e)}")
            # Return CSV fallback
            # NOTE(review): this fallback does not quote fields; a comma in
            # the title would break the CSV (same issue fixed in CsvRenderer).
            return f"Title,Content\n{title},Error rendering Excel report: {str(e)}", "text/csv"

    def _generate_excel(self, content: str, title: str) -> str:
        """Generate a fixed three-sheet workbook (Summary/Data/Analysis)
        and return it as a base64-encoded string."""
        try:
            # Create workbook
            wb = Workbook()

            # Remove default sheet
            wb.remove(wb.active)

            # Create sheets
            summary_sheet = wb.create_sheet("Summary", 0)
            data_sheet = wb.create_sheet("Data", 1)
            analysis_sheet = wb.create_sheet("Analysis", 2)

            # Add content to sheets
            self._populate_summary_sheet(summary_sheet, title)
            self._populate_data_sheet(data_sheet, content)
            self._populate_analysis_sheet(analysis_sheet, content)

            # Save to buffer
            buffer = io.BytesIO()
            wb.save(buffer)
            buffer.seek(0)

            # Convert to base64
            excel_bytes = buffer.getvalue()
            excel_base64 = base64.b64encode(excel_bytes).decode('utf-8')

            return excel_base64

        except Exception as e:
            self.logger.error(f"Error generating Excel: {str(e)}")
            raise

    def _populate_summary_sheet(self, sheet, title: str):
        """Populate the summary sheet: title, generation info, and a live
        COUNTA formula referencing the Data sheet. Best-effort (logs only)."""
        try:
            # Title
            sheet['A1'] = title
            sheet['A1'].font = Font(size=16, bold=True)
            sheet['A1'].alignment = Alignment(horizontal='center')

            # Generation info
            sheet['A3'] = "Generated:"
            sheet['B3'] = self._format_timestamp()
            sheet['A4'] = "Status:"
            sheet['B4'] = "Generated Successfully"

            # Key metrics placeholder
            sheet['A6'] = "Key Metrics:"
            sheet['A6'].font = Font(bold=True)
            sheet['A7'] = "Total Items:"
            sheet['B7'] = "=COUNTA(Data!A:A)-1"  # Count non-empty cells in Data sheet

            # Auto-adjust column widths
            sheet.column_dimensions['A'].width = 20
            sheet.column_dimensions['B'].width = 30

        except Exception as e:
            self.logger.warning(f"Could not populate summary sheet: {str(e)}")

    def _populate_data_sheet(self, sheet, content: str):
        """Populate the data sheet from the AI content.

        Pipe-delimited lines become rows (capped at 5 columns to match the
        fixed header); everything else goes into column A.
        """
        try:
            # Headers
            headers = ["Item/Category", "Value/Amount", "Percentage", "Source Document", "Notes/Comments"]
            for col, header in enumerate(headers, 1):
                cell = sheet.cell(row=1, column=col, value=header)
                cell.font = Font(bold=True)
                cell.fill = PatternFill(start_color="CCCCCC", end_color="CCCCCC", fill_type="solid")

            # Process content
            lines = content.split('\n')
            row = 2

            for line in lines:
                line = line.strip()
                if not line:
                    continue

                # Check for table data (lines with |)
                if '|' in line:
                    cells = [cell.strip() for cell in line.split('|') if cell.strip()]
                    for col, cell_data in enumerate(cells[:5], 1):  # Limit to 5 columns
                        sheet.cell(row=row, column=col, value=cell_data)
                    row += 1
                else:
                    # Regular content
                    sheet.cell(row=row, column=1, value=line)
                    row += 1

            # Auto-adjust column widths
            for col in range(1, 6):
                sheet.column_dimensions[get_column_letter(col)].width = 20

        except Exception as e:
            self.logger.warning(f"Could not populate data sheet: {str(e)}")

    def _populate_analysis_sheet(self, sheet, content: str):
        """Populate the analysis sheet with simple line-type counts and a
        fixed set of boilerplate recommendations. Best-effort (logs only)."""
        try:
            # Title
            sheet['A1'] = "Analysis & Insights"
            sheet['A1'].font = Font(size=14, bold=True)

            # Content analysis
            lines = content.split('\n')
            row = 3

            sheet['A3'] = "Content Analysis:"
            sheet['A3'].font = Font(bold=True)
            row += 1

            # Count different types of content
            table_lines = sum(1 for line in lines if '|' in line)
            list_lines = sum(1 for line in lines if line.startswith(('- ', '* ')))
            text_lines = len(lines) - table_lines - list_lines

            sheet[f'A{row}'] = f"Total Lines: {len(lines)}"
            row += 1
            sheet[f'A{row}'] = f"Table Rows: {table_lines}"
            row += 1
            sheet[f'A{row}'] = f"List Items: {list_lines}"
            row += 1
            sheet[f'A{row}'] = f"Text Lines: {text_lines}"
            row += 2

            # Recommendations (static boilerplate)
            sheet[f'A{row}'] = "Recommendations:"
            sheet[f'A{row}'].font = Font(bold=True)
            row += 1
            sheet[f'A{row}'] = "1. Review data accuracy"
            row += 1
            sheet[f'A{row}'] = "2. Consider additional analysis"
            row += 1
            sheet[f'A{row}'] = "3. Update regularly"

            # Auto-adjust column width
            sheet.column_dimensions['A'].width = 30

        except Exception as e:
            self.logger.warning(f"Could not populate analysis sheet: {str(e)}")
+""" + +from .base_renderer import BaseRenderer +from typing import Dict, Any, Tuple, List + +class HtmlRenderer(BaseRenderer): + """Renders content to HTML format with format-specific extraction.""" + + @classmethod + def get_supported_formats(cls) -> List[str]: + """Return supported HTML formats.""" + return ['html', 'htm'] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """Return format aliases.""" + return ['web', 'webpage'] + + @classmethod + def get_priority(cls) -> int: + """Return priority for HTML renderer.""" + return 100 + + def getExtractionPrompt(self, user_prompt: str, title: str) -> str: + """Return only HTML-specific guidelines; global prompt is built centrally.""" + return ( + "HTML FORMAT GUIDELINES:\n" + "- Output a complete HTML5 document starting with .\n" + "- Include , with and , and <body>.\n" + "- Use semantic elements: <header>, <main>, <section>, <article>, <footer>.\n" + "- Provide professional CSS in a <style> block; responsive, clean typography.\n" + "- Use h1/h2/h3 for headings; tables and lists for structure.\n" + "OUTPUT: Return ONLY valid HTML (no markdown, no code fences)." + ) + + async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: + """Render extracted content to HTML format.""" + try: + # The extracted content should already be HTML from the AI + # Just clean it up and ensure it's valid + html_content = self._clean_html_content(extracted_content, title) + + return html_content, "text/html" + + except Exception as e: + self.logger.error(f"Error rendering HTML: {str(e)}") + # Return minimal HTML fallback + return f"<html><head><title>{title}

{title}

Error rendering report: {str(e)}

", "text/html" + + def _clean_html_content(self, content: str, title: str) -> str: + """Clean and validate HTML content from AI.""" + content = content.strip() + + # Remove markdown code blocks if present + if content.startswith("```") and content.endswith("```"): + lines = content.split('\n') + if len(lines) > 2: + content = '\n'.join(lines[1:-1]).strip() + + # Ensure it starts with DOCTYPE + if not content.startswith('\n' + content + else: + content = f'\n\n{title}\n\n{content}\n\n' + + return content diff --git a/modules/services/serviceGeneration/renderers/json_renderer.py b/modules/services/serviceGeneration/renderers/json_renderer.py new file mode 100644 index 00000000..845d33c2 --- /dev/null +++ b/modules/services/serviceGeneration/renderers/json_renderer.py @@ -0,0 +1,74 @@ +""" +JSON renderer for report generation. +""" + +from .base_renderer import BaseRenderer +from typing import Dict, Any, Tuple, List +import json + +class JsonRenderer(BaseRenderer): + """Renders content to JSON format with format-specific extraction.""" + + @classmethod + def get_supported_formats(cls) -> List[str]: + """Return supported JSON formats.""" + return ['json'] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """Return format aliases.""" + return ['data'] + + @classmethod + def get_priority(cls) -> int: + """Return priority for JSON renderer.""" + return 80 + + def getExtractionPrompt(self, user_prompt: str, title: str) -> str: + """Return only JSON-specific guidelines; global prompt is built centrally.""" + return ( + "JSON FORMAT GUIDELINES:\n" + "- Output ONLY a single valid JSON object (no fences, no pre/post text).\n" + "- Choose a structure that best fits the user's intent; include a top-level title and data.\n" + "- Prefer arrays/objects that map cleanly to the extracted facts.\n" + "- Include minimal metadata only if useful (e.g., generatedAt, sources).\n" + "OUTPUT: Return ONLY valid, parseable JSON." 
+ ) + + async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: + """Render extracted content to JSON format.""" + try: + # The extracted content should already be JSON from the AI + # Just validate and format it + json_content = self._clean_json_content(extracted_content, title) + + return json_content, "application/json" + + except Exception as e: + self.logger.error(f"Error rendering JSON: {str(e)}") + # Return minimal JSON fallback + fallback_data = { + "title": title, + "sections": [{"type": "text", "content": f"Error rendering report: {str(e)}"}], + "metadata": {"error": str(e)} + } + return json.dumps(fallback_data, indent=2), "application/json" + + def _clean_json_content(self, content: str, title: str) -> str: + """Clean and validate JSON content from AI.""" + content = content.strip() + + # Remove markdown code blocks if present + if content.startswith("```") and content.endswith("```"): + lines = content.split('\n') + if len(lines) > 2: + content = '\n'.join(lines[1:-1]).strip() + + # Validate JSON + try: + parsed = json.loads(content) + # Re-format with proper indentation + return json.dumps(parsed, indent=2, ensure_ascii=False) + except json.JSONDecodeError: + # If not valid JSON, return as-is + return content diff --git a/modules/services/serviceGeneration/renderers/markdown_renderer.py b/modules/services/serviceGeneration/renderers/markdown_renderer.py new file mode 100644 index 00000000..8b9b4293 --- /dev/null +++ b/modules/services/serviceGeneration/renderers/markdown_renderer.py @@ -0,0 +1,65 @@ +""" +Markdown renderer for report generation. 
+""" + +from .base_renderer import BaseRenderer +from typing import Dict, Any, Tuple, List + +class MarkdownRenderer(BaseRenderer): + """Renders content to Markdown format with format-specific extraction.""" + + @classmethod + def get_supported_formats(cls) -> List[str]: + """Return supported Markdown formats.""" + return ['md', 'markdown'] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """Return format aliases.""" + return ['mdown', 'mkd'] + + @classmethod + def get_priority(cls) -> int: + """Return priority for markdown renderer.""" + return 95 + + def getExtractionPrompt(self, user_prompt: str, title: str) -> str: + """Return only Markdown-specific guidelines; global prompt is built centrally.""" + return ( + "MARKDOWN FORMAT GUIDELINES:\n" + "- Use proper Markdown syntax only (no HTML wrappers).\n" + "- # for main title, ## for sections, ### for subsections.\n" + "- Tables with | separators and a header row.\n" + "- Bullet lists with - or *.\n" + "- Emphasis with **bold** and *italic*.\n" + "- Code blocks with ```language.\n" + "- Horizontal rules (---) to separate major sections when helpful.\n" + "- Include links [text](url) and images ![alt](url) when referenced by sources.\n" + "OUTPUT: Return ONLY raw Markdown content without code fences." 
+ ) + + async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: + """Render extracted content to Markdown format.""" + try: + # The extracted content should already be Markdown from the AI + # Just clean it up + markdown_content = self._clean_markdown_content(extracted_content, title) + + return markdown_content, "text/markdown" + + except Exception as e: + self.logger.error(f"Error rendering markdown: {str(e)}") + # Return minimal markdown fallback + return f"# {title}\n\nError rendering report: {str(e)}", "text/markdown" + + def _clean_markdown_content(self, content: str, title: str) -> str: + """Clean and validate Markdown content from AI.""" + content = content.strip() + + # Remove markdown code blocks if present + if content.startswith("```") and content.endswith("```"): + lines = content.split('\n') + if len(lines) > 2: + content = '\n'.join(lines[1:-1]).strip() + + return content diff --git a/modules/services/serviceGeneration/renderers/pdf_renderer.py b/modules/services/serviceGeneration/renderers/pdf_renderer.py new file mode 100644 index 00000000..6a8409a1 --- /dev/null +++ b/modules/services/serviceGeneration/renderers/pdf_renderer.py @@ -0,0 +1,225 @@ +""" +PDF renderer for report generation using reportlab. 
+""" + +from .base_renderer import BaseRenderer +from typing import Dict, Any, Tuple, List +import io +import base64 +from datetime import datetime, UTC + +try: + from reportlab.lib.pagesizes import letter, A4 + from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak + from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle + from reportlab.lib.units import inch + from reportlab.lib import colors + from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY + REPORTLAB_AVAILABLE = True +except ImportError: + REPORTLAB_AVAILABLE = False + +class PdfRenderer(BaseRenderer): + """Renders content to PDF format using reportlab.""" + + @classmethod + def get_supported_formats(cls) -> List[str]: + """Return supported PDF formats.""" + return ['pdf'] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """Return format aliases.""" + return ['document', 'print'] + + @classmethod + def get_priority(cls) -> int: + """Return priority for PDF renderer.""" + return 120 + + def getExtractionPrompt(self, user_prompt: str, title: str) -> str: + """Return only PDF-specific guidelines; global prompt is built centrally.""" + return ( + "PDF FORMAT GUIDELINES:\n" + "- Provide structured content suitable for pagination and headings (H1/H2/H3-like).\n" + "- Use bullet lists and tables where useful; separate major sections clearly.\n" + "- Avoid markdown/HTML; produce clean, plain content that can be laid out as PDF.\n" + "OUTPUT: Return ONLY the PDF-ready textual content (no fences)." 
+ ) + + async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: + """Render extracted content to PDF format.""" + try: + if not REPORTLAB_AVAILABLE: + # Fallback to HTML if reportlab not available + from .html_renderer import HtmlRenderer + html_renderer = HtmlRenderer() + html_content, _ = await html_renderer.render(extracted_content, title) + return html_content, "text/html" + + # Generate PDF using reportlab + pdf_content = self._generate_pdf(extracted_content, title) + + return pdf_content, "application/pdf" + + except Exception as e: + self.logger.error(f"Error rendering PDF: {str(e)}") + # Return minimal fallback + return f"PDF Generation Error: {str(e)}", "text/plain" + + def _generate_pdf(self, content: str, title: str) -> str: + """Generate PDF content using reportlab.""" + try: + # Create a buffer to hold the PDF + buffer = io.BytesIO() + + # Create PDF document + doc = SimpleDocTemplate( + buffer, + pagesize=A4, + rightMargin=72, + leftMargin=72, + topMargin=72, + bottomMargin=18 + ) + + # Get styles + styles = getSampleStyleSheet() + + # Create custom styles + title_style = ParagraphStyle( + 'CustomTitle', + parent=styles['Heading1'], + fontSize=24, + spaceAfter=30, + alignment=TA_CENTER, + textColor=colors.darkblue + ) + + heading_style = ParagraphStyle( + 'CustomHeading', + parent=styles['Heading2'], + fontSize=16, + spaceAfter=12, + spaceBefore=12, + textColor=colors.darkblue + ) + + # Build PDF content + story = [] + + # Title page + story.append(Paragraph(title, title_style)) + story.append(Spacer(1, 20)) + story.append(Paragraph(f"Generated: {self._format_timestamp()}", styles['Normal'])) + story.append(PageBreak()) + + # Process content + lines = content.split('\n') + current_section = [] + + for line in lines: + line = line.strip() + if not line: + continue + + # Check for headings + if line.startswith('# '): + # H1 heading + if current_section: + story.extend(self._process_section(current_section, styles)) + 
current_section = [] + story.append(Paragraph(line[2:], title_style)) + story.append(Spacer(1, 12)) + elif line.startswith('## '): + # H2 heading + if current_section: + story.extend(self._process_section(current_section, styles)) + current_section = [] + story.append(Paragraph(line[3:], heading_style)) + story.append(Spacer(1, 8)) + elif line.startswith('### '): + # H3 heading + if current_section: + story.extend(self._process_section(current_section, styles)) + current_section = [] + story.append(Paragraph(line[4:], styles['Heading3'])) + story.append(Spacer(1, 6)) + else: + current_section.append(line) + + # Process remaining content + if current_section: + story.extend(self._process_section(current_section, styles)) + + # Build PDF + doc.build(story) + + # Get PDF content as base64 + buffer.seek(0) + pdf_bytes = buffer.getvalue() + pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8') + + return pdf_base64 + + except Exception as e: + self.logger.error(f"Error generating PDF: {str(e)}") + raise + + def _process_section(self, lines: list, styles) -> list: + """Process a section of content into PDF elements.""" + elements = [] + + for line in lines: + if not line.strip(): + continue + + # Check for tables (lines with |) + if '|' in line and not line.startswith('|'): + # This might be part of a table, process as table + table_data = self._extract_table_data(lines) + if table_data: + table = Table(table_data) + table.setStyle(TableStyle([ + ('BACKGROUND', (0, 0), (-1, 0), colors.grey), + ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), + ('ALIGN', (0, 0), (-1, -1), 'CENTER'), + ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'), + ('FONTSIZE', (0, 0), (-1, 0), 14), + ('BOTTOMPADDING', (0, 0), (-1, 0), 12), + ('BACKGROUND', (0, 1), (-1, -1), colors.beige), + ('GRID', (0, 0), (-1, -1), 1, colors.black) + ])) + elements.append(table) + elements.append(Spacer(1, 12)) + return elements + + # Check for lists + if line.startswith('- ') or line.startswith('* '): + # This 
is a list item + elements.append(Paragraph(f"• {line[2:]}", styles['Normal'])) + else: + # Regular paragraph + elements.append(Paragraph(line, styles['Normal'])) + + elements.append(Spacer(1, 6)) + return elements + + def _extract_table_data(self, lines: list) -> list: + """Extract table data from lines.""" + table_data = [] + in_table = False + + for line in lines: + if '|' in line: + if not in_table: + in_table = True + # Split by | and clean up + cells = [cell.strip() for cell in line.split('|') if cell.strip()] + if cells: + table_data.append(cells) + elif in_table and not line.strip(): + # Empty line, might be end of table + break + + return table_data if len(table_data) > 1 else [] \ No newline at end of file diff --git a/modules/services/serviceGeneration/renderers/registry.py b/modules/services/serviceGeneration/renderers/registry.py new file mode 100644 index 00000000..5c498081 --- /dev/null +++ b/modules/services/serviceGeneration/renderers/registry.py @@ -0,0 +1,157 @@ +""" +Renderer registry for automatic discovery and registration of renderers. 
+""" + +import logging +import importlib +import pkgutil +from typing import Dict, Type, List, Optional +from .base_renderer import BaseRenderer + +logger = logging.getLogger(__name__) + +class RendererRegistry: + """Registry for automatic renderer discovery and management.""" + + def __init__(self): + self._renderers: Dict[str, Type[BaseRenderer]] = {} + self._format_mappings: Dict[str, str] = {} + self._discovered = False + + def discover_renderers(self) -> None: + """Automatically discover and register all renderers by scanning files.""" + if self._discovered: + return + + try: + import os + import sys + from pathlib import Path + + # Get the directory containing this registry file + current_dir = Path(__file__).parent + renderers_dir = current_dir + + # Get the package name dynamically + package_name = __name__.rsplit('.', 1)[0] + + # Scan all Python files in the renderers directory + for file_path in renderers_dir.glob("*.py"): + if file_path.name in ['registry.py', 'base_renderer.py', '__init__.py']: + continue + + # Extract module name from filename + module_name = file_path.stem + + try: + # Import the module dynamically + full_module_name = f"{package_name}.{module_name}" + module = importlib.import_module(full_module_name) + + # Look for renderer classes in the module + for attr_name in dir(module): + attr = getattr(module, attr_name) + if (isinstance(attr, type) and + issubclass(attr, BaseRenderer) and + attr != BaseRenderer and + hasattr(attr, 'get_supported_formats')): + + # Register the renderer + self._register_renderer_class(attr) + logger.info(f"Discovered renderer: {attr.__name__} from {module_name}") + + except Exception as e: + logger.warning(f"Could not load renderer from {module_name}: {str(e)}") + continue + + self._discovered = True + logger.info(f"Renderer discovery completed. 
Found {len(self._renderers)} renderers.") + + except Exception as e: + logger.error(f"Error during renderer discovery: {str(e)}") + self._discovered = True # Mark as discovered to avoid repeated attempts + + def _register_renderer_class(self, renderer_class: Type[BaseRenderer]) -> None: + """Register a renderer class with its supported formats.""" + try: + # Get supported formats from the renderer class + supported_formats = renderer_class.get_supported_formats() + + for format_name in supported_formats: + # Register primary format + self._renderers[format_name.lower()] = renderer_class + + # Register aliases if any + if hasattr(renderer_class, 'get_format_aliases'): + aliases = renderer_class.get_format_aliases() + for alias in aliases: + self._format_mappings[alias.lower()] = format_name.lower() + + logger.debug(f"Registered {renderer_class.__name__} for formats: {supported_formats}") + + except Exception as e: + logger.error(f"Error registering renderer {renderer_class.__name__}: {str(e)}") + + def get_renderer(self, output_format: str) -> Optional[BaseRenderer]: + """Get a renderer instance for the specified format.""" + if not self._discovered: + self.discover_renderers() + + # Normalize format name + format_name = output_format.lower().strip() + + # Check for aliases first + if format_name in self._format_mappings: + format_name = self._format_mappings[format_name] + + # Get renderer class + renderer_class = self._renderers.get(format_name) + + if renderer_class: + try: + return renderer_class() + except Exception as e: + logger.error(f"Error creating renderer instance for {format_name}: {str(e)}") + return None + + logger.warning(f"No renderer found for format: {output_format}") + return None + + def get_supported_formats(self) -> List[str]: + """Get list of all supported formats.""" + if not self._discovered: + self.discover_renderers() + + formats = list(self._renderers.keys()) + formats.extend(self._format_mappings.keys()) + return sorted(set(formats)) + 
+ def get_renderer_info(self) -> Dict[str, Dict[str, str]]: + """Get information about all registered renderers.""" + if not self._discovered: + self.discover_renderers() + + info = {} + for format_name, renderer_class in self._renderers.items(): + info[format_name] = { + 'class_name': renderer_class.__name__, + 'module': renderer_class.__module__, + 'description': getattr(renderer_class, '__doc__', 'No description').strip().split('\n')[0] if renderer_class.__doc__ else 'No description' + } + + return info + +# Global registry instance +_registry = RendererRegistry() + +def get_renderer(output_format: str) -> Optional[BaseRenderer]: + """Get a renderer instance for the specified format.""" + return _registry.get_renderer(output_format) + +def get_supported_formats() -> List[str]: + """Get list of all supported formats.""" + return _registry.get_supported_formats() + +def get_renderer_info() -> Dict[str, Dict[str, str]]: + """Get information about all registered renderers.""" + return _registry.get_renderer_info() diff --git a/modules/services/serviceGeneration/renderers/text_renderer.py b/modules/services/serviceGeneration/renderers/text_renderer.py new file mode 100644 index 00000000..67e32069 --- /dev/null +++ b/modules/services/serviceGeneration/renderers/text_renderer.py @@ -0,0 +1,94 @@ +""" +Text renderer for report generation. 
+""" + +from .base_renderer import BaseRenderer +from typing import Dict, Any, Tuple, List + +class TextRenderer(BaseRenderer): + """Renders content to plain text format with format-specific extraction.""" + + @classmethod + def get_supported_formats(cls) -> List[str]: + """Return supported text formats (excluding formats with dedicated renderers).""" + return [ + 'txt', 'text', 'plain', + # Programming languages + 'js', 'javascript', 'ts', 'typescript', 'jsx', 'tsx', + 'py', 'python', 'java', 'cpp', 'c', 'h', 'hpp', + 'cs', 'csharp', 'php', 'rb', 'ruby', 'go', 'rs', 'rust', + 'swift', 'kt', 'kotlin', 'scala', 'r', 'm', 'objc', + 'sh', 'bash', 'zsh', 'fish', 'ps1', 'bat', 'cmd', + # Web technologies (excluding html/htm which have dedicated renderer) + 'css', 'scss', 'sass', 'less', 'xml', 'yaml', 'yml', 'toml', 'ini', 'cfg', + # Data formats (excluding csv, md/markdown which have dedicated renderers) + 'tsv', 'log', 'rst', 'sql', 'dockerfile', 'dockerignore', 'gitignore', + # Configuration files + 'env', 'properties', 'conf', 'config', 'rc', + 'gitattributes', 'editorconfig', 'eslintrc', + # Documentation + 'readme', 'changelog', 'license', 'authors', + 'contributing', 'todo', 'notes', 'docs' + ] + + @classmethod + def get_format_aliases(cls) -> List[str]: + """Return format aliases.""" + return [ + 'ascii', 'utf8', 'utf-8', 'code', 'source', + 'script', 'program', 'file', 'document', + 'raw', 'unformatted', 'plaintext' + ] + + @classmethod + def get_priority(cls) -> int: + """Return priority for text renderer.""" + return 90 + + def getExtractionPrompt(self, user_prompt: str, title: str) -> str: + """Return only plain-text guidelines; global prompt is built centrally.""" + return ( + "TEXT FORMAT GUIDELINES:\n" + "- Output ONLY plain text (no markdown or HTML).\n" + "- Use clear headings (you may underline with === or --- when helpful).\n" + "- Use simple bullet lists with '-' and tables with '|' when needed.\n" + "- Preserve indentation for code-like content if 
present.\n" + "OUTPUT: Return ONLY the raw text content." + ) + + async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: + """Render extracted content to plain text format.""" + try: + # The extracted content should already be formatted text from the AI + # Just clean it up + text_content = self._clean_text_content(extracted_content, title) + + return text_content, "text/plain" + + except Exception as e: + self.logger.error(f"Error rendering text: {str(e)}") + # Return minimal text fallback + return f"{title}\n\nError rendering report: {str(e)}", "text/plain" + + def _clean_text_content(self, content: str, title: str) -> str: + """Clean and validate text content from AI.""" + content = content.strip() + + # Remove markdown code blocks if present + if content.startswith("```") and content.endswith("```"): + lines = content.split('\n') + if len(lines) > 2: + content = '\n'.join(lines[1:-1]).strip() + + # Remove any remaining markdown formatting + content = content.replace('**', '').replace('*', '') + content = content.replace('__', '').replace('_', '') + + # Clean up any HTML-like tags that might have slipped through + import re + content = re.sub(r'<[^>]+>', '', content) + + # Ensure proper line endings + content = content.replace('\r\n', '\n').replace('\r', '\n') + + return content diff --git a/modules/services/serviceWorkflow/mainServiceWorkflow.py b/modules/services/serviceWorkflow/mainServiceWorkflow.py index 7ab66872..180779b5 100644 --- a/modules/services/serviceWorkflow/mainServiceWorkflow.py +++ b/modules/services/serviceWorkflow/mainServiceWorkflow.py @@ -3,7 +3,7 @@ import uuid from typing import Dict, Any, List, Optional from modules.datamodels.datamodelUam import User, UserConnection from modules.datamodels.datamodelChat import ChatDocument, ChatMessage -from modules.datamodels.datamodelChat import ExtractedContent +from modules.datamodels.datamodelChat import ChatContentExtracted from 
modules.services.serviceExtraction.mainServiceExtraction import ExtractionService from modules.services.serviceGeneration.subDocumentUtility import getFileExtension, getMimeTypeFromExtension, detectContentTypeFromData from modules.shared.timezoneUtils import get_utc_timestamp @@ -78,6 +78,12 @@ class WorkflowService: def getChatDocumentsFromDocumentList(self, documentList: List[str]) -> List[ChatDocument]: """Get ChatDocuments from a list of document references using all three formats.""" try: + # Get the current workflow from services (same pattern as setWorkflowContext) + workflow = getattr(self.serviceCenter, 'currentWorkflow', None) or self.workflow + if not workflow: + logger.error("No workflow available for document list resolution") + return [] + all_documents = [] for doc_ref in documentList: if doc_ref.startswith("docItem:"): @@ -86,7 +92,7 @@ class WorkflowService: if len(parts) >= 2: doc_id = parts[1] # Find the document by ID - for message in self.workflow.messages: + for message in workflow.messages: if message.documents: for doc in message.documents: if doc.id == doc_id: @@ -101,9 +107,15 @@ class WorkflowService: # Format: docList::