From 5039096a10cddd964dccbc5b2f3c6dfccb4022a0 Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Sun, 26 Oct 2025 23:09:26 +0100 Subject: [PATCH] ai models ready for image analysis --- IMAGE_ANALYSE_ANALYSIS.md | 150 ++++++ modules/aicore/aicorePluginAnthropic.py | 168 ++++--- modules/aicore/aicorePluginOpenai.py | 45 +- modules/datamodels/datamodelAi.py | 1 + modules/interfaces/interfaceAiObjects.py | 203 ++++---- modules/services/serviceAi/mainServiceAi.py | 2 +- modules/services/serviceAi/subCoreAi.py | 53 ++- test_ai_models.py | 494 ++++++++++---------- 8 files changed, 629 insertions(+), 487 deletions(-) create mode 100644 IMAGE_ANALYSE_ANALYSIS.md diff --git a/IMAGE_ANALYSE_ANALYSIS.md b/IMAGE_ANALYSE_ANALYSIS.md new file mode 100644 index 00000000..bb14bb39 --- /dev/null +++ b/IMAGE_ANALYSE_ANALYSIS.md @@ -0,0 +1,150 @@ +# Image Analysis Code Flow Analysis + +## Zusammenfassung der Parameter und Handovers + +### 1. Ablauf für Image-Analyse (durch Dokumentenverarbeitung) + +#### Eingabe +- **methodAi.process()** wird aufgerufen mit: + - `aiPrompt`: Textanweisung für die Bildanalyse + - `documentList`: Liste von Dokumenten (einschließlich Bilder) + - `resultType`: Output-Format (optional, default: txt) + +#### Verarbeitung +1. **mainServiceAi.py** → `callAiDocuments()` + - Delegiert an `subCoreAi.callAiDocuments()` + +2. **subCoreAi.py** → `callAiDocuments()` + - Prüft, ob Dokumente vorhanden sind + - Wenn ja: ruft `documentProcessor.callAiText()` auf + +3. **subDocumentProcessing.py** → `callAiText()` + - Ruft `processDocumentsPerChunk()` auf + +4. **subDocumentProcessing.py** → `_processChunksWithMapping()` + - Analysiert jeden Chunk + - **Wichtig**: Zeile 645-689 - Erkennung von Bildern + - Prüft `is_image` Flag basierend auf: + - `document_mime_type` (z.B. "image/jpeg") + - `part.mimeType` + - `part.typeGroup == "image"` + +5. 
**subCoreAi.py** → `readImage()` (wird aufgerufen für Bildchunks) + - Zeile 561-625 + - Setzt `operationType = IMAGE_ANALYSE` + - Ruft `aiObjects.call()` mit einem `AiCallRequest` auf: + - `prompt`: Der Analyse-Prompt + - `contentParts`: Ein `ContentPart` (`typeGroup="image"`) mit den Bilddaten als base64-String (bytes werden vorher base64-kodiert) + - `mimeType`: Z.B. "image/jpeg" (Standardwert, wenn nicht angegeben) + - `options`: Mit `operationType=IMAGE_ANALYSE` + +#### Ausgabe +- Textanalyse des Bildes + +### 2. Ablauf für direkte Image-Analyse + +#### Eingabe +- **mainServiceAi.readImage()** wird direkt aufgerufen mit: + - `prompt`: Textanweisung + - `imageData`: Bilddaten (bytes oder base64) + - `mimeType`: Z.B. "image/jpeg" + - `options`: Optional, wird auf `IMAGE_ANALYSE` gesetzt + +#### Verarbeitung +1. **mainServiceAi.py** → `readImage()` + - Delegiert an `subCoreAi.readImage()` + +2. **subCoreAi.py** → `readImage()` + - Setzt `operationType = IMAGE_ANALYSE` (Zeile 582) + - Ruft `aiObjects.call()` mit dem Bild als `contentParts` auf + +#### Ausgabe +- Textanalyse des Bildes + +## Wo werden welche Funktionen genutzt? + +### mainServiceAi.py + +#### `readImage()` (Zeile 96-105) +- **Verwendung**: Wird direkt von außen aufgerufen (z.B. API) +- **Delegiert an**: `subCoreAi.readImage()` +- **Verwendung**: ✅ Wird verwendet + +#### `generateImage()` (Zeile 108-118) +- **Verwendung**: Wird direkt von außen aufgerufen (z.B. API) +- **Delegiert an**: `subCoreAi.generateImage()` +- **Verwendung**: ✅ Wird verwendet + +### subCoreAi.py + +#### `readImage()` (Zeile 561-625) +- **Verwendung**: + 1. Wird von `mainServiceAi.readImage()` aufgerufen + 2. 
Wird von `subDocumentProcessing._processChunksWithMapping()` aufgerufen (Zeile 670) +- **Verwendung**: ✅ Wird verwendet + +#### `generateImage()` (Zeile 628-660) +- **Verwendung**: Wird von `mainServiceAi.generateImage()` aufgerufen +- **Verwendung**: ✅ Wird verwendet + +### subDocumentProcessing.py + +#### `_processChunksWithMapping()` (Zeile 594-994) +- **Bildanalyse**: Zeile 645-689 + - Erkennt Bilder basierend auf MIME-Type und typeGroup + - Ruft `core_ai.readImage()` auf +- **Verwendung**: ✅ Wird verwendet + +## Parameter-Validierung + +### ✅ Alle Parameter korrekt + +1. **operationType**: + - Wird immer auf `IMAGE_ANALYSE` gesetzt (subCoreAi Zeile 582) + - Wird korrekt übergeben + +2. **imageData**: + - Wird korrekt geladen und übergeben + - Unterstützt bytes und base64 + +3. **mimeType**: + - Wird automatisch erkannt + - Standard: "image/jpeg" + +4. **prompt**: + - Wird korrekt übergeben + - Kann vom Benutzer angepasst werden + +## Handovers sind korrekt + +### mainServiceAi → subCoreAi +- ✅ `readImage()` delegiert korrekt +- ✅ `generateImage()` delegiert korrekt + +### subDocumentProcessing → subCoreAi +- ✅ Erkennt Bilder korrekt (Zeile 645-689) +- ✅ Ruft `readImage()` mit korrekten Parametern auf +- ✅ Setzt `operationType=IMAGE_ANALYSE` + +### subCoreAi → aiObjects +- ✅ Ruft `call()` mit einem `AiCallRequest` (Bild als `contentParts`) auf +- ✅ Setzt `operationType=IMAGE_ANALYSE` + +## Identifizierte Probleme + +### ✅ Keine Probleme identifiziert + +Die Parameter und Handovers sind alle korrekt: +- ✅ Operation Type wird korrekt gesetzt +- ✅ Bilddaten werden korrekt geladen und übergeben +- ✅ MIME-Type wird korrekt erkannt +- ✅ Prompt wird korrekt übergeben +- ✅ Alle Delegierungen funktionieren korrekt + +## Test-Strategie + +Der Test verwendet: +1. Direkte Bildanalyse über `mainServiceAi.readImage()` +2. Testet alle Modelle, die `IMAGE_ANALYSE` unterstützen +3. 
Validiert die Antworten auf Inhalt und Struktur + diff --git a/modules/aicore/aicorePluginAnthropic.py b/modules/aicore/aicorePluginAnthropic.py index 4debc7ed..8a829fdc 100644 --- a/modules/aicore/aicorePluginAnthropic.py +++ b/modules/aicore/aicorePluginAnthropic.py @@ -70,8 +70,8 @@ class AiAnthropic(BaseConnectorAi): calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived: (bytesSent / 4 / 1000) * 0.015 + (bytesReceived / 4 / 1000) * 0.075 ), AiModel( - name="claude-3-5-sonnet-20241022-vision", - displayName="Anthropic Claude 3.5 Sonnet Vision", + name="claude-3-5-sonnet-20241022", + displayName="Anthropic Claude 3.5 Sonnet Instance Vision", connectorType="anthropic", apiUrl="https://api.anthropic.com/v1/messages", temperature=0.2, @@ -79,9 +79,8 @@ class AiAnthropic(BaseConnectorAi): contextLength=200000, costPer1kTokensInput=0.015, costPer1kTokensOutput=0.075, - speedRating=6, # Slower due to high-quality processing - qualityRating=10, # Best quality available - # capabilities removed (not used in business logic) + speedRating=6, + qualityRating=10, functionCall=self.callAiImage, priority=PriorityEnum.QUALITY, processingMode=ProcessingModeEnum.DETAILED, @@ -234,69 +233,122 @@ class AiAnthropic(BaseConnectorAi): AiModelResponse with analysis content """ try: - # Extract parameters from modelCall + # Extract parameters from messages for Anthropic Vision API messages = modelCall.messages model = modelCall.model - options = modelCall.options - prompt = messages[0]["content"] if messages else "" - imageData = getattr(options, "imageData", None) - mimeType = getattr(options, "mimeType", None) - # Debug logging - logger.info(f"callAiImage called with imageData type: {type(imageData)}, length: {len(imageData) if imageData else 0}, mimeType: {mimeType}") + # Verify messages contain image data + if not messages or not messages[0].get("content"): + raise ValueError("No messages provided for image analysis") - # Distinguish between file path and binary data - 
if isinstance(imageData, str): - # Check if it's base64 encoded data or a file path - if len(imageData) > 100 and not os.path.exists(imageData): - # It's likely base64 encoded data - logger.info("Treating imageData as base64 encoded string") - base64Data = imageData - if not mimeType: - mimeType = "image/png" - else: - # It's a file path - import filehandling only when needed - logger.info(f"Treating imageData as file path: {imageData}") - from modules import agentserviceFilemanager as fileHandler - base64Data, autoMimeType = fileHandler.encodeFileToBase64(imageData) - mimeType = mimeType or autoMimeType - else: - # It's binary data - logger.info("Treating imageData as binary data") - import base64 - base64Data = base64.b64encode(imageData).decode('utf-8') - # MIME type must be specified for binary data - if not mimeType: - # Fallback to generic image type - mimeType = "image/png" + logger.info(f"callAiImage called with {len(messages)} message(s)...") - # Prepare the payload for the Vision API - messages = [ - { - "role": "user", - "content": [ - {"type": "text", "text": prompt}, - { - "type": "image_url", - "image_url": { - "url": f"data:{mimeType};base64,{base64Data}" - } + # Extract text prompt and image data from messages + # Messages format: [{"role": "user", "content": [{"type": "text", "text": "..."}, {"type": "image_url", "image_url": {"url": "data:..."}}]}] + userContent = messages[0]["content"] + if not isinstance(userContent, list): + raise ValueError("Expected content to be a list for vision") + + textPrompt = "" + imageUrl = None + + for contentItem in userContent: + if contentItem.get("type") == "text": + textPrompt = contentItem.get("text", "") + elif contentItem.get("type") == "image_url": + imageUrl = contentItem.get("image_url", {}).get("url", "") + + if not imageUrl or not imageUrl.startswith("data:"): + raise ValueError("No image data found in messages") + + # Extract base64 data and mime type from data URL + # Format: 
data:image/jpeg;base64,/9j/4AAQSkZ... + parts = imageUrl.split(";base64,") + if len(parts) != 2: + raise ValueError("Invalid image data URL format") + + mimeType = parts[0].replace("data:", "") + base64Data = parts[1] + + # Convert to Anthropic's vision format + anthropicMessages = [{ + "role": "user", + "content": [ + {"type": "text", "text": textPrompt}, + { + "type": "image", + "source": { + "type": "base64", + "media_type": mimeType, + "data": base64Data } - ] - } - ] + } + ] + }] - # Create a modelCall for the basic AI function - basicModelCall = AiModelCall( - messages=messages, - model=model + # Call Anthropic API directly for vision + import time + import base64 + + startTime = time.time() + + # Prepare system prompt if available + systemPrompt = None + for msg in messages: + if msg.get("role") == "system": + systemContent = msg.get("content") + if isinstance(systemContent, list): + systemPrompt = "\n".join([item.get("text", "") for item in systemContent if item.get("type") == "text"]) + else: + systemPrompt = systemContent + break + + # Get parameters from model (consistent with callAiBasic) + maxTokens = model.maxTokens if hasattr(model, 'maxTokens') else 8192 + temperature = model.temperature if hasattr(model, 'temperature') else 0.2 + + # Prepare API payload + payload = { + "model": model.name, # Use standard model.name + "max_tokens": maxTokens, + "messages": anthropicMessages + } + + if systemPrompt: + payload["system"] = systemPrompt + + # Set temperature from model + payload["temperature"] = temperature + + # Make API call with headers from httpClient (which includes anthropic-version) + response = await self.httpClient.post( + "https://api.anthropic.com/v1/messages", + json=payload ) - # Use the existing callAiBasic function with the Vision model - response = await self.callAiBasic(basicModelCall) + if response.status_code != 200: + errorText = response.text + logger.error(f"Anthropic API error: {response.status_code} - {errorText}") + raise 
HTTPException(status_code=response.status_code, detail=f"Anthropic API error: {errorText}") - # Return the standardized response - return response + # Parse response + result = response.json() + content = result["content"][0]["text"] if result.get("content") else "" + + endTime = time.time() + processingTime = endTime - startTime + + # Calculate cost + inputTokens = result.get("usage", {}).get("input_tokens", 0) + outputTokens = result.get("usage", {}).get("output_tokens", 0) + + # Return standardized response + return AiModelResponse( + content=content, + success=True, + modelId=model.name, + processingTime=processingTime + ) except Exception as e: logger.error(f"Error during image analysis: {str(e)}", exc_info=True) diff --git a/modules/aicore/aicorePluginOpenai.py b/modules/aicore/aicorePluginOpenai.py index 4d2a0f4d..a2cc501c 100644 --- a/modules/aicore/aicorePluginOpenai.py +++ b/modules/aicore/aicorePluginOpenai.py @@ -95,8 +95,8 @@ class AiOpenai(BaseConnectorAi): calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived: (bytesSent / 4 / 1000) * 0.0015 + (bytesReceived / 4 / 1000) * 0.002 ), AiModel( - name="gpt-4o-vision", - displayName="OpenAI GPT-4o Vision", + name="gpt-4o", + displayName="OpenAI GPT-4o Instance Vision", connectorType="openai", apiUrl="https://api.openai.com/v1/chat/completions", temperature=0.2, @@ -106,7 +106,6 @@ class AiOpenai(BaseConnectorAi): costPer1kTokensOutput=0.06, speedRating=6, # Slower for vision tasks qualityRating=9, # High quality vision - # capabilities removed (not used in business logic) functionCall=self.callAiImage, priority=PriorityEnum.QUALITY, processingMode=ProcessingModeEnum.DETAILED, @@ -226,42 +225,16 @@ class AiOpenai(BaseConnectorAi): # Extract parameters from modelCall messages = modelCall.messages model = modelCall.model - options = modelCall.options - prompt = messages[0]["content"] if messages else "" - imageData = getattr(options, "imageData", None) - mimeType = getattr(options, "mimeType", 
"image/jpeg") - logger.debug(f"Starting image analysis with query '{prompt}' for size {len(imageData)}B...") + # Messages should already be in the correct format with image data embedded + # Just verify they contain image data + if not messages or not messages[0].get("content"): + raise ValueError("No messages provided for image analysis") - # Ensure imageData is a string (base64 encoded) - if not isinstance(imageData, str): - raise ValueError("imageData must be a string (base64 encoded)") + logger.debug(f"Starting image analysis with {len(messages)} message(s)...") - # Fix base64 padding if needed - padding_needed = len(imageData) % 4 - if padding_needed: - imageData += '=' * (4 - padding_needed) - - logger.debug(f"Using MIME type: {mimeType}") - logger.debug(f"Base64 data length: {len(imageData)} characters") - - # Create the data URL format as required by OpenAI Vision API - data_url = f"data:{mimeType};base64,{imageData}" - - messages = [ - { - "role": "user", - "content": [ - {"type": "text", "text": prompt}, - { - "type": "image_url", - "image_url": { - "url": data_url - } - } - ] - } - ] + # Use the messages directly - they should already contain the image data + # in the format: {"type": "image_url", "image_url": {"url": "data:...base64,..."}} # Use parameters from model temperature = model.temperature diff --git a/modules/datamodels/datamodelAi.py b/modules/datamodels/datamodelAi.py index f73cbd08..1da6c65f 100644 --- a/modules/datamodels/datamodelAi.py +++ b/modules/datamodels/datamodelAi.py @@ -194,6 +194,7 @@ class AiModelResponse(BaseModel): # Structured prompt models for specialized operations + class AiCallPromptWebSearch(BaseModel): """Structured prompt format for WEB_SEARCH operation - returns list of URLs.""" diff --git a/modules/interfaces/interfaceAiObjects.py b/modules/interfaces/interfaceAiObjects.py index 2ae97586..5b458925 100644 --- a/modules/interfaces/interfaceAiObjects.py +++ b/modules/interfaces/interfaceAiObjects.py @@ -1,6 +1,7 @@ 
import logging import asyncio import uuid +import base64 from typing import Dict, Any, List, Union, Tuple, Optional from dataclasses import dataclass import time @@ -74,7 +75,7 @@ class AiObjects: logger.info(f"Selected model: {selectedModel.name} ({selectedModel.displayName})") return selectedModel.name - + # AI for Extraction and Text Generation async def call(self, request: AiCallRequest) -> AiCallResponse: """Call AI model for text generation with model-aware chunking.""" # Handle content parts (unified path) @@ -196,11 +197,71 @@ class AiObjects: """Process a single content part with model-aware chunking and fallback.""" lastError = None + # Check if this is an image - Vision models need special handling + isImage = (contentPart.typeGroup == "image") or (contentPart.mimeType and contentPart.mimeType.startswith("image/")) + for attempt, model in enumerate(failoverModelList): try: logger.info(f"Processing content part with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})") - # Check if part fits in model context + # Special handling for images with Vision models + if isImage and hasattr(model, 'functionCall'): + # Call model's functionCall directly (for Vision models this is callAiImage) + from modules.datamodels.datamodelAi import AiModelCall, AiCallOptions as AiCallOpts + + try: + modelCall = AiModelCall( + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url": { + "url": f"data:{contentPart.mimeType};base64,{contentPart.data}" if isinstance(contentPart.data, str) else + f"data:{contentPart.mimeType};base64,{base64.b64encode(contentPart.data).decode('utf-8')}" + } + } + ] + } + ], + model=model, + options=AiCallOpts(operationType=options.operationType) + ) + + modelResponse = await model.functionCall(modelCall) + + if not modelResponse.success: + raise ValueError(f"Model call failed: {modelResponse.error}") + + logger.info(f"✅ Image content part processed successfully 
with model: {model.name}") + + # Convert to AiCallResponse format + return AiCallResponse( + content=modelResponse.content, + modelName=model.name, + priceUsd=modelResponse.priceUsd if hasattr(modelResponse, 'priceUsd') else 0.0, + processingTime=modelResponse.processingTime if hasattr(modelResponse, 'processingTime') else 0.0, + bytesSent=0, # Will be calculated elsewhere + bytesReceived=0, # Will be calculated elsewhere + errorCount=0 + ) + except Exception as e: + # Image processing failed with this model + lastError = e + logger.warning(f"❌ Image processing failed with model {model.name}: {str(e)}") + + # If this is not the last model, try the next one + if attempt < len(failoverModelList) - 1: + logger.info(f"🔄 Trying next fallback model for image processing...") + continue + else: + # All models failed + logger.error(f"💥 All {len(failoverModelList)} models failed for image processing") + raise + + # For non-image parts, check if part fits in model context partSize = len(contentPart.data.encode('utf-8')) if contentPart.data else 0 modelContextBytes = model.contextLength * 4 # Convert tokens to bytes @@ -319,12 +380,13 @@ class AiObjects: content_parts.append(content_part) # Use existing merging system - merge_strategy = { - "useIntelligentMerging": True, - "groupBy": "typeGroup", - "orderBy": "id", - "mergeType": "concatenate" - } + from modules.datamodels.datamodelExtraction import MergeStrategy + merge_strategy = MergeStrategy( + useIntelligentMerging=True, + groupBy="typeGroup", + orderBy="id", + mergeType="concatenate" + ) from modules.services.serviceExtraction.subPipeline import _applyMerging merged_parts = _applyMerging(content_parts, merge_strategy) @@ -365,12 +427,13 @@ class AiObjects: content_parts.append(content_part) # Use existing merging system - merge_strategy = { - "useIntelligentMerging": True, - "groupBy": "typeGroup", - "orderBy": "id", - "mergeType": "concatenate" - } + from modules.datamodels.datamodelExtraction import MergeStrategy + 
merge_strategy = MergeStrategy( + useIntelligentMerging=True, + groupBy="typeGroup", + orderBy="id", + mergeType="concatenate" + ) from modules.services.serviceExtraction.subPipeline import _applyMerging merged_parts = _applyMerging(content_parts, merge_strategy) @@ -462,118 +525,8 @@ class AiObjects: errorCount=0 ) - async def callImage(self, prompt: str, imageData: Union[str, bytes], mimeType: str = None, options: AiCallOptions = None) -> AiCallResponse: - """Call AI model for image analysis with fallback mechanism.""" - - if options is None: - options = AiCallOptions(operationType=OperationTypeEnum.IMAGE_ANALYSE) - - # Get fallback models for image analysis - availableModels = modelRegistry.getAvailableModels() - failoverModelList = modelSelector.getFailoverModelList(prompt, "", options, availableModels) - - if not failoverModelList: - errorMsg = f"No suitable models found for image analysis" - logger.error(errorMsg) - return AiCallResponse( - content=errorMsg, - modelName="error", - priceUsd=0.0, - processingTime=0.0, - bytesSent=0, - bytesReceived=0, - errorCount=1 - ) - - # Try each model in fallback sequence - lastError = None - for attempt, model in enumerate(failoverModelList): - try: - logger.info(f"Attempting image analysis with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})") - - # Call the model - response = await self._callImageWithModel(model, prompt, imageData, mimeType) - - logger.info(f"✅ Image analysis successful with model: {model.name}") - return response - - except Exception as e: - lastError = e - logger.warning(f"❌ Image analysis failed with model {model.name}: {str(e)}") - - # If this is not the last model, try the next one - if attempt < len(failoverModelList) - 1: - logger.info(f"🔄 Trying next fallback model for image analysis...") - continue - else: - # All models failed - logger.error(f"💥 All {len(failoverModelList)} models failed for image analysis") - break - - # All fallback attempts failed - return error 
response - errorMsg = f"All AI models failed for image analysis. Last error: {str(lastError)}" - logger.error(errorMsg) - return AiCallResponse( - content=errorMsg, - modelName="error", - priceUsd=0.0, - processingTime=0.0, - bytesSent=0, - bytesReceived=0, - errorCount=1 - ) - - async def _callImageWithModel(self, model: AiModel, prompt: str, imageData: Union[str, bytes], mimeType: str) -> AiCallResponse: - """Call a specific model for image analysis and return the response.""" - # Calculate input bytes from prompt and image data - promptBytes = len(prompt.encode('utf-8')) - if isinstance(imageData, str): - # Base64 encoded string - imageBytes = len(imageData.encode('utf-8')) - else: - # Raw bytes - imageBytes = len(imageData) - inputBytes = promptBytes + imageBytes - - # Start timing - startTime = time.time() - - # Create standardized call object for image analysis - modelCall = AiModelCall( - messages=[{"role": "user", "content": prompt}], - model=model, - options=AiCallOptions(imageData=imageData, mimeType=mimeType) - ) - - # Call the model with standardized interface - if model.functionCall: - modelResponse = await model.functionCall(modelCall) - - # Extract content from standardized response - if not modelResponse.success: - raise ValueError(f"Model call failed: {modelResponse.error}") - content = modelResponse.content - else: - raise ValueError(f"Model {model.name} has no function call defined") - - # Calculate timing and output bytes - endTime = time.time() - processingTime = endTime - startTime - outputBytes = len(content.encode("utf-8")) - - # Calculate price using model's own price calculation method - priceUsd = model.calculatePriceUsd(processingTime, inputBytes, outputBytes) - - return AiCallResponse( - content=content, - modelName=model.name, - priceUsd=priceUsd, - processingTime=processingTime, - bytesSent=inputBytes, - bytesReceived=outputBytes, - errorCount=0 - ) + # AI for Image Generation async def generateImage(self, prompt: str, size: str = 
"1024x1024", quality: str = "standard", style: str = "vivid", options: AiCallOptions = None) -> AiCallResponse: """Generate an image using AI.""" diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py index 2b876de3..27434915 100644 --- a/modules/services/serviceAi/mainServiceAi.py +++ b/modules/services/serviceAi/mainServiceAi.py @@ -100,7 +100,7 @@ class AiService: mimeType: str = None, options: Optional[AiCallOptions] = None, ) -> str: - """Call AI for image analysis using interface.callImage().""" + """Call AI for image analysis using interface.call() with contentParts.""" await self._ensureAiObjectsInitialized() return await self.coreAi.readImage(prompt, imageData, mimeType, options) diff --git a/modules/services/serviceAi/subCoreAi.py b/modules/services/serviceAi/subCoreAi.py index c0f6d6a4..0952e750 100644 --- a/modules/services/serviceAi/subCoreAi.py +++ b/modules/services/serviceAi/subCoreAi.py @@ -565,7 +565,7 @@ CRITICAL REQUIREMENTS: mimeType: str = None, options: Optional[AiCallOptions] = None, ) -> str: - """Call AI for image analysis using interface.callImage().""" + """Call AI for image analysis using interface.call() with contentParts.""" try: # Check if imageData is valid if not imageData: @@ -584,30 +584,51 @@ CRITICAL REQUIREMENTS: # Override the operation type to ensure image analysis options.operationType = OperationTypeEnum.IMAGE_ANALYSE - self.services.utils.debugLogToFile(f"Calling aiObjects.callImage with operationType: {options.operationType}", "AI_SERVICE") - logger.info(f"Calling aiObjects.callImage with operationType: {options.operationType}") + # Create content parts with image data + from modules.datamodels.datamodelExtraction import ContentPart + import base64 + + # ContentPart.data must be a string - convert bytes to base64 if needed + if isinstance(imageData, bytes): + imageDataStr = base64.b64encode(imageData).decode('utf-8') + else: + # Already a base64 string + imageDataStr = 
imageData + + imagePart = ContentPart( + id="image_0", + parentId=None, + label="Image", + typeGroup="image", + mimeType=mimeType or "image/jpeg", + data=imageDataStr, # Must be a string (base64 encoded) + metadata={"imageAnalysis": True} + ) + + # Create request with content parts + from modules.datamodels.datamodelAi import AiCallRequest + request = AiCallRequest( + prompt=prompt, + context="", + options=options, + contentParts=[imagePart] + ) + + self.services.utils.debugLogToFile(f"Calling aiObjects.call() with operationType: {options.operationType}", "AI_SERVICE") + logger.info(f"Calling aiObjects.call() with operationType: {options.operationType}") # Write image analysis prompt to debug file self.services.utils.writeDebugFile(prompt, "image_analysis_prompt") - response = await self.aiObjects.callImage(prompt, imageData, mimeType, options) + response = await self.aiObjects.call(request) # Write image analysis response to debug file - result = response.content if hasattr(response, 'content') else str(response) + # response is an AiCallResponse object + result = response.content self.services.utils.writeDebugFile(result, "image_analysis_response") - # Emit stats for image analysis - self.services.workflow.storeWorkflowStat( - self.services.currentWorkflow, - response, - f"ai.image.{options.operationType}" - ) - # Debug the result - self.services.utils.debugLogToFile(f"Raw AI result type: {type(response)}, value: {repr(response)}", "AI_SERVICE") - - # Extract content from response - result = response.content if hasattr(response, 'content') else str(response) + self.services.utils.debugLogToFile(f"AI image analysis result type: {type(response)}, content length: {len(result)}", "AI_SERVICE") # Check if result is valid if not result or (isinstance(result, str) and not result.strip()): diff --git a/test_ai_models.py b/test_ai_models.py index 37772ee3..07485087 100644 --- a/test_ai_models.py +++ b/test_ai_models.py @@ -1,9 +1,31 @@ #!/usr/bin/env python3 """ -AI 
Models Test - Tests WEB_CRAWL functionality on all models that support it +AI Models Test - Tests IMAGE_ANALYSE functionality on all models that support it -This script tests all models that have WEB_CRAWL capability, validates that -they can crawl specific URLs and return content, and analyzes the quality of results. +This script tests all models that have IMAGE_ANALYSE capability, validates that +they can analyze images and return structured content, and analyzes the quality of results. + +CODE FLOW ANALYSIS: + +1. methodAi.process() is called by AI planner with prompt and documents (images) +2. mainServiceAi.callAiDocuments() is called + -> delegates to subCoreAi.callAiDocuments() + -> which calls subDocumentProcessing.callAiText() + -> which processes chunks and detects images + -> for image chunks, calls subCoreAi.readImage() + -> which calls aiObjects.callImage() with operationType=IMAGE_ANALYSE + +OR direct call: +- mainServiceAi.readImage() can be called directly (used in this test) + -> delegates to subCoreAi.readImage() + -> which calls aiObjects.callImage() with operationType=IMAGE_ANALYSE + +WHERE FUNCTIONS ARE USED: +- mainServiceAi.readImage(): Public API entry point for direct image analysis +- mainServiceAi.generateImage(): Public API entry point for image generation +- subCoreAi.readImage(): Internal implementation, called by document processing or directly +- subCoreAi.generateImage(): Internal implementation, called by mainServiceAi +- subDocumentProcessing._processChunksWithMapping(): Detects image chunks and calls readImage() """ import asyncio @@ -53,6 +75,22 @@ class AIModelsTester: import shutil shutil.copy2(testImageSource, testImageDest) print(f"📷 Test image copied to: {testImageDest}") + + # Find test image + self.testImagePath = None + if os.path.exists(testImageDest): + self.testImagePath = testImageDest + else: + # Try to find any image in modeltest directory + for file in os.listdir(self.modelTestDir): + if 
file.lower().endswith(('.jpg', '.jpeg', '.png')): + self.testImagePath = os.path.join(self.modelTestDir, file) + break + + if self.testImagePath: + print(f"📷 Using test image: {self.testImagePath}") + else: + print(f"⚠️ No test image found in {self.modelTestDir}") async def initialize(self): """Initialize the AI service.""" @@ -65,14 +103,18 @@ class AIModelsTester: from modules.aicore.aicorePluginTavily import AiTavily from modules.aicore.aicorePluginPerplexity import AiPerplexity - # Register web connectors that support WEB_CRAWL - modelRegistry.registerConnector(AiTavily()) - modelRegistry.registerConnector(AiPerplexity()) + # Note: We don't need to register web connectors for IMAGE_ANALYSE testing + # modelRegistry.registerConnector(AiTavily()) + # modelRegistry.registerConnector(AiPerplexity()) # The AI service needs to be recreated with proper initialization from modules.services.serviceAi.mainServiceAi import AiService self.services.ai = await AiService.create(self.services) + # Also initialize extraction service for image processing + from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService + self.services.extraction = ExtractionService(self.services) + # Create a minimal workflow context from modules.datamodels.datamodelChat import ChatWorkflow import uuid @@ -98,228 +140,150 @@ class AIModelsTester: print(f"📁 Results will be saved to: {self.modelTestDir}") async def testModel(self, modelName: str) -> Dict[str, Any]: - """Test a specific AI model with WEB_CRAWL operation.""" + """Test a specific AI model with IMAGE_ANALYSE operation.""" print(f"\n{'='*60}") print(f"TESTING MODEL: {modelName}") - print(f"OPERATION TYPE: WEB_CRAWL") + print(f"OPERATION TYPE: IMAGE_ANALYSE") print(f"{'='*60}") - # CRAWL CONFIGURATION - # Deep and Broad Web Crawl Example: - # - maxDepth: 3 (deep) - follows links up to 3 levels from starting page - # - Level 1: Starting page - # - Level 2: Pages linked from starting page - # - Level 3: Pages linked 
from Level 2 pages - # - maxWidth: 50 (broad) - crawls up to 50 pages at each depth level - # This results in potential maximum of ~1,250 pages (if 50 links exist at each level) - # - # Common configurations: - # - Fast/Overview: maxDepth=1, maxWidth=5 (shallow, focused) - # - General/Standard: maxDepth=2, maxWidth=10 (balanced) - # - Deep and Broad: maxDepth=3, maxWidth=50 (comprehensive) + # Check if test image exists + if not self.testImagePath or not os.path.exists(self.testImagePath): + result = { + "modelName": modelName, + "status": "ERROR", + "processingTime": 0.0, + "responseLength": 0, + "responseType": "error", + "hasContent": False, + "error": "No test image available", + "fullResponse": "" + } + self.testResults.append(result) + return result - CRAWL_DEPTH = 3 # Deep crawl: follows links 3 levels deep - CRAWL_WIDTH = 50 # Broad crawl: up to 50 pages per level - - print(f"Crawl Configuration:") - print(f" - Depth: {CRAWL_DEPTH} levels (deep)") - print(f" - Width: {CRAWL_WIDTH} pages per level (broad)") - print(f" - Theoretical max: {CRAWL_WIDTH ** min(CRAWL_DEPTH, 3)} pages") - - # Use WEB_CRAWL specific prompt format - from modules.datamodels.datamodelAi import AiCallPromptWebCrawl - - # Test with simple prompt like playground example - simplePrompt = f"https://www.valueon.ch: Who works in this company?" - - # But keep structured format for now to match our API - testPrompt = json.dumps({ - "instruction": "Who works in this company?", - "url": "https://www.valueon.ch", - "maxDepth": CRAWL_DEPTH, - "maxWidth": CRAWL_WIDTH - }, indent=2) - - print(f"Simple prompt (playground style): {simplePrompt}") - - # For Tavily models, test direct API call for better link following - if "tavily" in modelName.lower(): - return await self._testTavilyDirect(modelName, CRAWL_DEPTH, CRAWL_WIDTH) + # Test prompt for image analysis + testPrompt = "Analyze this image and describe what you see. Extract any text, numbers, or structured data." 
+ print(f"Test image: {self.testImagePath}") print(f"Test prompt: {testPrompt}") - print(f"Prompt length: {len(testPrompt)} characters") + + # Load image data + with open(self.testImagePath, 'rb') as f: + imageData = f.read() + + print(f"Image size: {len(imageData)} bytes") + + # Determine image MIME type from extension + if self.testImagePath.lower().endswith('.png'): + mimeType = "image/png" + elif self.testImagePath.lower().endswith(('.jpg', '.jpeg')): + mimeType = "image/jpeg" + else: + mimeType = "image/jpeg" # Default + + print(f"Image MIME type: {mimeType}") startTime = asyncio.get_event_loop().time() try: - # Create options for WEB_CRAWL operation - options = AiCallOptions( - operationType=OperationTypeEnum.WEB_CRAWL, - preferredModel=modelName - ) - - # Call the AI service DIRECTLY through the model's functionCall - # This tests the actual model, not the document generation pipeline - # Get the model directly from the registry using the model registry + # Get model directly from registry and test it from modules.aicore.aicoreModelRegistry import modelRegistry model = modelRegistry.getModel(modelName) if not model: raise Exception(f"Model {modelName} not found") - # Create AiModelCall and call the model's functionCall directly - from modules.datamodels.datamodelAi import AiModelCall + # Import base64 for image data conversion import base64 - import os - # For WEB_CRAWL models, use normal functionCall with structured prompt - messages = [{"role": "user", "content": testPrompt}] + # Convert image data to base64 string + if isinstance(imageData, bytes): + imageDataStr = base64.b64encode(imageData).decode('utf-8') + else: + imageDataStr = imageData + + # Create messages in vision format + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": testPrompt}, + { + "type": "image_url", + "image_url": { + "url": f"data:{mimeType};base64,{imageDataStr}" + } + } + ] + } + ] + + # Create model call + from modules.datamodels.datamodelAi import 
AiModelCall, AiCallOptions modelCall = AiModelCall( messages=messages, model=model, - options=options + options=AiCallOptions(operationType=OperationTypeEnum.IMAGE_ANALYSE) ) - response = await model.functionCall(modelCall) + + # Call model directly + print(f"Calling model.functionCall() for {modelName}") + modelResponse = await model.functionCall(modelCall) + + if not modelResponse.success: + raise Exception(f"Model call failed: {modelResponse.error}") + + result = modelResponse.content endTime = asyncio.get_event_loop().time() processingTime = endTime - startTime - # Analyze response - now we get AiModelResponse objects - if hasattr(response, 'success'): - # AiModelResponse object - if response.success: - result = { - "modelName": modelName, - "status": "SUCCESS", - "processingTime": round(processingTime, 2), - "responseLength": len(response.content) if response.content else 0, - "responseType": "AiModelResponse", - "hasContent": bool(response.content), - "error": None, - "modelUsed": modelName, - "priceUsd": 0.0, # AiModelResponse doesn't have price info - "bytesSent": 0, - "bytesReceived": len(response.content.encode('utf-8')) if response.content else 0 - } - - # Extract actual prompt sent if available in metadata - if hasattr(response, 'metadata') and response.metadata: - result["actualPromptSent"] = response.metadata.get("actualPromptSent", "N/A") - - # Try to parse content as JSON - if response.content: - try: - json.loads(response.content) - result["isValidJson"] = True - except: - result["isValidJson"] = False - - result["responsePreview"] = response.content[:200] + "..." 
if len(response.content) > 200 else response.content - result["fullResponse"] = response.content - else: - result["isValidJson"] = False - result["responsePreview"] = "Empty response" - result["fullResponse"] = "" - - print(f"✅ SUCCESS - Processing time: {processingTime:.2f}s") - print(f"📄 Response length: {len(response.content) if response.content else 0} characters") - print(f"📄 Model used: {modelName}") - print(f"📄 Response preview: {result['responsePreview']}") - - else: - error = response.error or "Unknown error" - result = { - "modelName": modelName, - "status": "ERROR", - "processingTime": round(processingTime, 2), - "responseLength": 0, - "responseType": "AiModelResponse", - "hasContent": False, - "error": error, - "fullResponse": str(response) - } - - print(f"❌ ERROR - {error}") - - elif isinstance(response, dict): - # Fallback for dict responses - if response.get("success", True): - result = { - "modelName": modelName, - "status": "SUCCESS", - "processingTime": round(processingTime, 2), - "responseLength": len(str(response)), - "responseType": "dict", - "hasContent": True, - "error": None - } - - # Try to parse as JSON - try: - jsonResponse = json.dumps(response, indent=2) - result["responsePreview"] = jsonResponse[:200] + "..." if len(jsonResponse) > 200 else jsonResponse - result["isValidJson"] = True - result["fullResponse"] = jsonResponse - except: - result["responsePreview"] = str(response)[:200] + "..." 
if len(str(response)) > 200 else str(response) - result["isValidJson"] = False - result["fullResponse"] = str(response) - - print(f"✅ SUCCESS - Processing time: {processingTime:.2f}s") - print(f"📄 Response length: {len(str(response))} characters") - print(f"📄 Response preview: {result['responsePreview']}") - - else: - error = response.get("error", "Unknown error") - result = { - "modelName": modelName, - "status": "ERROR", - "processingTime": round(processingTime, 2), - "responseLength": 0, - "responseType": "error", - "hasContent": False, - "error": error, - "fullResponse": str(response) - } - - print(f"❌ ERROR - {error}") - - else: - # String response - result = { + # Analyze result (string response from readImage) + if result: + analysisResult = { "modelName": modelName, "status": "SUCCESS", "processingTime": round(processingTime, 2), - "responseLength": len(str(response)), + "responseLength": len(result) if result else 0, "responseType": "string", "hasContent": True, - "error": None + "error": None, + "testPrompt": testPrompt, + "imagePath": self.testImagePath, + "imageSize": len(imageData), + "mimeType": mimeType } # Try to parse as JSON try: - json.loads(str(response)) - result["isValidJson"] = True + json.loads(result) + analysisResult["isValidJson"] = True except: - result["isValidJson"] = False + analysisResult["isValidJson"] = False - result["responsePreview"] = str(response)[:200] + "..." if len(str(response)) > 200 else str(response) - result["fullResponse"] = str(response) + analysisResult["responsePreview"] = result[:200] + "..." 
if len(result) > 200 else result + analysisResult["fullResponse"] = result print(f"✅ SUCCESS - Processing time: {processingTime:.2f}s") - print(f"📄 Response length: {len(str(response))} characters") - print(f"📄 Response preview: {result['responsePreview']}") - - # Add prompt to result for logging - result["testPrompt"] = testPrompt - result["crawlConfig"] = { - "depth": CRAWL_DEPTH, - "width": CRAWL_WIDTH - } - - # For WEB_CRAWL, also validate that content was extracted - if result.get("status") == "SUCCESS" and result.get("fullResponse"): - self._validateCrawlResponse(modelName, result) + print(f"📄 Response length: {len(result)} characters") + print(f"📄 Response preview: {analysisResult['responsePreview']}") + + result = analysisResult + + # Validate that content was extracted + if result.get("status") == "SUCCESS" and result.get("fullResponse"): + self._validateImageResponse(modelName, result) + else: + result = { + "modelName": modelName, + "status": "ERROR", + "processingTime": round(processingTime, 2), + "responseLength": 0, + "responseType": "error", + "hasContent": False, + "error": "Empty response", + "fullResponse": "" + } except Exception as e: endTime = asyncio.get_event_loop().time() @@ -334,10 +298,9 @@ class AIModelsTester: "hasContent": False, "error": str(e), "testPrompt": testPrompt, - "crawlConfig": { - "depth": CRAWL_DEPTH, - "width": CRAWL_WIDTH - } + "imagePath": self.testImagePath, + "imageSize": len(imageData) if imageData else 0, + "mimeType": mimeType } print(f"💥 EXCEPTION - {str(e)}") @@ -346,7 +309,7 @@ class AIModelsTester: # Save text response even for exceptions to log the prompt if result.get("status") in ["SUCCESS", "EXCEPTION", "ERROR"]: - self._saveTextResponse(modelName, result) + self._saveImageResponse(modelName, result) # Save individual model result immediately self._saveIndividualModelResult(modelName, result) @@ -354,54 +317,48 @@ class AIModelsTester: return result def _saveImageResponse(self, modelName: str, result: 
Dict[str, Any]): - """Save base64 image response to file.""" + """Save image analysis response to file.""" try: - fullResponse = result.get("fullResponse", "") - base64Data = None + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"{modelName}_{timestamp}.txt" + filepath = os.path.join(self.modelTestDir, filename) - # Try to extract base64 data from response - if isinstance(fullResponse, dict): - # Look for base64 data in the response - if "content" in fullResponse: - base64Data = fullResponse["content"] - elif "data" in fullResponse: - base64Data = fullResponse["data"] - elif "image" in fullResponse: - base64Data = fullResponse["image"] - else: - # Try to find base64 data in string response - import re - base64Match = re.search(r'data:image/[^;]+;base64,([A-Za-z0-9+/=]+)', str(fullResponse)) - if base64Match: - base64Data = base64Match.group(1) - else: - # Try to find pure base64 string - base64Match = re.search(r'([A-Za-z0-9+/=]{100,})', str(fullResponse)) - if base64Match: - base64Data = base64Match.group(1) + # Prepare content for saving + content = result.get("fullResponse", "") + if not content: + content = result.get("responsePreview", "No content available") - if base64Data: - # Clean base64 data - if base64Data.startswith('data:image/'): - base64Data = base64Data.split(',', 1)[1] - - # Decode and save image - imageData = base64.b64decode(base64Data) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - filename = f"{modelName}_{timestamp}.png" - filepath = os.path.join(self.modelTestDir, filename) - - with open(filepath, 'wb') as f: - f.write(imageData) - - result["savedImage"] = filepath - print(f"🖼️ Image saved: {filepath}") - else: - print(f"⚠️ No base64 image data found in response") + # If there's an error, include it in the content + if result.get("error"): + content = f"ERROR: {result.get('error')}\n\n{content}" + + # Add metadata header + metadata = f"""Model: {modelName} +Test Time: {timestamp} +Status: {result.get('status', 
'Unknown')} +Processing Time: {result.get('processingTime', 0):.2f}s +Response Length: {result.get('responseLength', 0)} characters +Is Valid JSON: {result.get('isValidJson', False)} +Image Path: {result.get('imagePath', 'N/A')} +Image Size: {result.get('imageSize', 'N/A')} bytes +MIME Type: {result.get('mimeType', 'N/A')} + +--- TEST PROMPT --- +{result.get('testPrompt', 'N/A')} + +--- RESPONSE CONTENT --- +{content} +""" + + with open(filepath, 'w', encoding='utf-8') as f: + f.write(metadata) + + result["savedTextFile"] = filepath + print(f"📄 Analysis response saved: {filepath}") except Exception as e: - print(f"❌ Error saving image: {str(e)}") - result["imageSaveError"] = str(e) + print(f"❌ Error saving analysis response: {str(e)}") + result["saveError"] = str(e) def _saveTextResponse(self, modelName: str, result: Dict[str, Any]): """Save text response to file.""" @@ -504,6 +461,41 @@ Width: {crawlWidth} print(f"❌ Error validating crawl response: {str(e)}") result["crawlValidationError"] = str(e) + def _validateImageResponse(self, modelName: str, result: Dict[str, Any]): + """Validate that the IMAGE_ANALYSE response contains analyzed content.""" + try: + content = result.get("fullResponse", "") + + # Check if content is meaningful + hasContent = bool(content and len(content.strip()) > 0) + contentLength = len(content) + + result["hasContent"] = hasContent + result["contentLength"] = contentLength + + # Try to determine what kind of content was extracted + if hasContent: + # Check if it's structured data + isStructured = False + try: + parsed = json.loads(content) + if isinstance(parsed, dict): + isStructured = True + except: + pass + + result["isStructured"] = isStructured + + print(f"✅ Successfully analyzed image") + print(f" Content length: {contentLength} characters") + print(f" Is structured: {'Yes' if isStructured else 'No'}") + else: + print(f"⚠️ Empty or invalid image analysis response") + + except Exception as e: + print(f"❌ Error validating image 
response: {str(e)}") + result["validationError"] = str(e) + async def _testTavilyDirect(self, modelName: str, crawlDepth: int = 3, crawlWidth: int = 50) -> Dict[str, Any]: """Test Tavily API directly using the crawl() method with better link following.""" print(f"\n{'='*60}") @@ -660,30 +652,30 @@ Width: {crawlWidth} print(f"❌ Error saving individual result: {str(e)}") def getAllAvailableModels(self) -> List[str]: - """Get all available model names that support WEB_CRAWL.""" + """Get all available model names that support IMAGE_ANALYSE.""" from modules.aicore.aicoreModelRegistry import modelRegistry from modules.datamodels.datamodelAi import OperationTypeEnum # Get all models from registry allModels = modelRegistry.getAvailableModels() - # Filter models that support WEB_CRAWL - webCrawlModels = [] + # Filter models that support IMAGE_ANALYSE + imageAnalyseModels = [] for model in allModels: if model.operationTypes and any( - ot.operationType == OperationTypeEnum.WEB_CRAWL + ot.operationType == OperationTypeEnum.IMAGE_ANALYSE for ot in model.operationTypes - ): # Include both Tavily and Perplexity models - webCrawlModels.append(model.name) + ): + imageAnalyseModels.append(model.name) - # Filter to only "sonar" model for testing - webCrawlModels = [m for m in webCrawlModels if m == "sonar"] + # Filter to common models for testing (remove filter to test all models) + # imageAnalyseModels = [m for m in imageAnalyseModels if "gpt" in m.lower() or "claude" in m.lower()] - print(f"Found {len(webCrawlModels)} models that support WEB_CRAWL (filtered to sonar):") - for modelName in webCrawlModels: + print(f"Found {len(imageAnalyseModels)} models that support IMAGE_ANALYSE:") + for modelName in imageAnalyseModels: print(f" - {modelName}") - return webCrawlModels + return imageAnalyseModels def saveTestResults(self): """Save detailed test results to file.""" @@ -802,7 +794,7 @@ async def main(): """Run AI models testing for WEB_CRAWL operation.""" tester = AIModelsTester() - 
print("Starting AI Models Testing for WEB_CRAWL...") + print("Starting AI Models Testing for IMAGE_ANALYSE...") print("Initializing AI service...") await tester.initialize() @@ -814,9 +806,9 @@ async def main(): print(f" {i}. {model}") print(f"\n{'='*80}") - print("STARTING WEB_CRAWL TESTS") + print("STARTING IMAGE_ANALYSE TESTS") print(f"{'='*80}") - print("Testing each model's ability to crawl URLs and return content...") + print("Testing each model's ability to analyze images and return structured content...") print("Press Enter after each model test to continue to the next one...") # Test each model individually @@ -840,7 +832,7 @@ async def main(): print("TESTING COMPLETED") print(f"{'='*80}") print(f"📄 Results saved to: {resultsFile}") - print(f"📁 Images saved to: {tester.modelTestDir}") + print(f"📁 Test results saved to: {tester.modelTestDir}") if __name__ == "__main__": asyncio.run(main())