import logging
import asyncio
from typing import Dict, Any, List, Union, Tuple, Optional
from dataclasses import dataclass
import time

logger = logging.getLogger(__name__)

from modules.aicore.aicoreModelRegistry import modelRegistry
from modules.aicore.aicoreModelSelector import modelSelector
from modules.datamodels.datamodelAi import (
    AiModel,
    AiCallOptions,
    AiCallRequest,
    AiCallResponse,
    OperationTypeEnum,
)
from modules.datamodels.datamodelWeb import (
    WebResearchRequest,
    WebResearchActionResult,
    WebSearchResultItem,
    WebCrawlResultItem,
    WebSearchRequest,
    WebCrawlRequest,
)
from modules.datamodels.datamodelChat import ActionDocument


# Dynamic model registry - models are now loaded from connectors via aicore system

@dataclass(slots=True)
class AiObjects:
    """Centralized AI interface: dynamically discovers and uses AI models. Includes web functionality."""

    def __post_init__(self) -> None:
        # Auto-discover and register all available connectors
        self._discoverAndRegisterConnectors()

    def _discoverAndRegisterConnectors(self):
        """Auto-discover and register all available AI connectors."""
        logger.info("Auto-discovering AI connectors...")

        # Use the model registry's built-in discovery mechanism
        discoveredConnectors = modelRegistry.discoverConnectors()

        # Register each discovered connector
        for connector in discoveredConnectors:
            modelRegistry.registerConnector(connector)
            logger.info(f"Registered connector: {connector.getConnectorType()}")

        logger.info(f"Total connectors registered: {len(discoveredConnectors)}")
        logger.info("All AI connectors registered with dynamic model registry")

    @classmethod
    async def create(cls) -> "AiObjects":
        """Create AiObjects instance with auto-discovered connectors."""
        # No need to manually create connectors - they're auto-discovered
        return cls()

    def _selectModel(self, prompt: str, context: str, options: AiCallOptions) -> str:
        """Select the best model using the dynamic model selection system."""
        # Get available models from the dynamic registry
        availableModels = modelRegistry.getAvailableModels()

        if not availableModels:
            logger.error("No models available in the registry")
            raise ValueError("No AI models available")

        # Use the dynamic model selector
        selectedModel = modelSelector.selectModel(prompt, context, options, availableModels)

        if not selectedModel:
            logger.error("No suitable model found for the given criteria")
            raise ValueError("No suitable AI model found")

        logger.info(f"Selected model: {selectedModel.name} ({selectedModel.displayName})")
        return selectedModel.name

    async def call(self, request: AiCallRequest) -> AiCallResponse:
        """Call AI model for text generation with model-aware chunking."""
        # Handle content parts (unified path)
        if hasattr(request, 'contentParts') and request.contentParts:
            return await self._callWithContentParts(request)
        # Handle traditional text/context calls
        return await self._callWithTextContext(request)

    async def _callWithTextContext(self, request: AiCallRequest) -> AiCallResponse:
        """Call AI model for traditional text/context calls with fallback mechanism."""
        prompt = request.prompt
        context = request.context or ""
        options = request.options

        # Calculate input bytes
        inputBytes = len((prompt + context).encode("utf-8"))

        # Compress optionally (prompt/context) - simple truncation fallback kept here
        def _maybeTruncate(text: str, limit: int) -> str:
            data = text.encode("utf-8")
            if len(data) <= limit:
                return text
            return data[:limit].decode("utf-8", errors="ignore") + "... [truncated]"

        if options.compressPrompt and len(prompt.encode("utf-8")) > 2000:
            prompt = _maybeTruncate(prompt, 2000)
        if options.compressContext and len(context.encode("utf-8")) > 70000:
            context = _maybeTruncate(context, 70000)

        # Derive generation parameters
        temperature = getattr(options, "temperature", None)
        if temperature is None:
            temperature = 0.2
        maxTokens = getattr(options, "maxTokens", None)

        # Get failover models for this operation type
        availableModels = modelRegistry.getAvailableModels()
        failoverModelList = modelSelector.getFailoverModelList(prompt, context, options, availableModels)

        if not failoverModelList:
            errorMsg = f"No suitable models found for operation {options.operationType}"
            logger.error(errorMsg)
            return AiCallResponse(
                content=errorMsg,
                modelName="error",
                priceUsd=0.0,
                processingTime=0.0,
                bytesSent=inputBytes,
                bytesReceived=0,
                errorCount=1
            )

        # Try each model in failover sequence
        lastError = None
        for attempt, model in enumerate(failoverModelList):
            try:
                logger.info(f"Attempting AI call with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})")

                # Call the model
                response = await self._callWithModel(model, prompt, context, temperature, maxTokens, inputBytes)

                logger.info(f"✅ AI call successful with model: {model.name}")
                return response

            except Exception as e:
                lastError = e
                logger.warning(f"❌ AI call failed with model {model.name}: {str(e)}")

                # If this is not the last model, try the next one
                if attempt < len(failoverModelList) - 1:
                    logger.info("🔄 Trying next failover model...")
                    continue
                else:
                    # All models failed
                    logger.error(f"💥 All {len(failoverModelList)} models failed for operation {options.operationType}")
                    break

        # All failover attempts failed - return error response
        errorMsg = f"All AI models failed for operation {options.operationType}. Last error: {str(lastError)}"
        logger.error(errorMsg)
        return AiCallResponse(
            content=errorMsg,
            modelName="error",
            priceUsd=0.0,
            processingTime=0.0,
            bytesSent=inputBytes,
            bytesReceived=0,
            errorCount=1
        )

    async def _callWithContentParts(self, request: AiCallRequest) -> AiCallResponse:
        """Process content parts with model-aware chunking (unified for single and multiple parts)."""
        prompt = request.prompt
        options = request.options
        contentParts = request.contentParts

        # Get failover models
        availableModels = modelRegistry.getAvailableModels()
        failoverModelList = modelSelector.getFailoverModelList(prompt, "", options, availableModels)

        if not failoverModelList:
            return self._createErrorResponse("No suitable models found", 0, 0)

        # Process each content part
        allResults = []
        for contentPart in contentParts:
            partResult = await self._processContentPartWithFallback(contentPart, prompt, options, failoverModelList)
            allResults.append(partResult)

        # Merge all results
        mergedContent = self._mergePartResults(allResults)

        return AiCallResponse(
            content=mergedContent,
            modelName="multiple",
            priceUsd=sum(r.priceUsd for r in allResults),
            processingTime=sum(r.processingTime for r in allResults),
            bytesSent=sum(r.bytesSent for r in allResults),
            bytesReceived=sum(r.bytesReceived for r in allResults),
            errorCount=sum(r.errorCount for r in allResults)
        )

    async def _processContentPartWithFallback(self, contentPart, prompt: str, options, failoverModelList) -> AiCallResponse:
        """Process a single content part with model-aware chunking and fallback."""
        lastError = None

        for attempt, model in enumerate(failoverModelList):
            try:
                logger.info(f"Processing content part with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})")

                # Check if part fits in model context
                partSize = len(contentPart.data.encode('utf-8')) if contentPart.data else 0
                modelContextBytes = model.contextLength * 4  # Convert tokens to bytes

                if partSize <= modelContextBytes:
                    # Part fits - call AI directly
                    response = await self._callWithModel(model, prompt, contentPart.data, 0.2, None, partSize)
                    logger.info(f"✅ Content part processed successfully with model: {model.name}")
                    return response
                else:
                    # Part too large - chunk it
                    chunks = await self._chunkContentPart(contentPart, model, options)
                    if not chunks:
                        raise ValueError(f"Failed to chunk content part for model {model.name}")

                    # Process each chunk
                    chunkResults = []
                    for chunk in chunks:
                        chunkResponse = await self._callWithModel(model, prompt, chunk['data'], 0.2, None, chunk['size'])
                        chunkResults.append(chunkResponse)

                    # Merge chunk results
                    mergedContent = self._mergeChunkResults(chunkResults)
                    totalPrice = sum(r.priceUsd for r in chunkResults)
                    totalTime = sum(r.processingTime for r in chunkResults)
                    totalBytesSent = sum(r.bytesSent for r in chunkResults)
                    totalBytesReceived = sum(r.bytesReceived for r in chunkResults)
                    totalErrors = sum(r.errorCount for r in chunkResults)

                    logger.info(f"✅ Content part chunked and processed with model: {model.name} ({len(chunks)} chunks)")
                    return AiCallResponse(
                        content=mergedContent,
                        modelName=model.name,
                        priceUsd=totalPrice,
                        processingTime=totalTime,
                        bytesSent=totalBytesSent,
                        bytesReceived=totalBytesReceived,
                        errorCount=totalErrors
                    )

            except Exception as e:
                lastError = e
                logger.warning(f"❌ Model {model.name} failed for content part: {str(e)}")

                if attempt < len(failoverModelList) - 1:
                    logger.info("🔄 Trying next failover model...")
                    continue
                else:
                    logger.error(f"💥 All {len(failoverModelList)} models failed for content part")
                    break

        # All models failed
        return self._createErrorResponse(f"All models failed: {str(lastError)}", 0, 0)

    async def _chunkContentPart(self, contentPart, model, options) -> List[Dict[str, Any]]:
        """Chunk a content part based on model capabilities."""
        # Calculate model-specific chunk sizes
        modelContextBytes = model.contextLength * 4  # Convert tokens to bytes
        maxContextBytes = int(modelContextBytes * 0.9)  # 90% of context length
        textChunkSize = int(maxContextBytes * 0.7)  # 70% of max context for text chunks
        imageChunkSize = int(maxContextBytes * 0.8)  # 80% of max context for image chunks

        # Build chunking options
        chunkingOptions = {
            "textChunkSize": textChunkSize,
            "imageChunkSize": imageChunkSize,
            "maxSize": maxContextBytes,
            "chunkAllowed": True
        }

        # Get appropriate chunker
        from modules.services.serviceExtraction.subRegistry import ChunkerRegistry
        chunkerRegistry = ChunkerRegistry()
        chunker = chunkerRegistry.resolve(contentPart.typeGroup)

        if not chunker:
            logger.warning(f"No chunker found for typeGroup: {contentPart.typeGroup}")
            return []

        # Chunk the content part
        try:
            chunks = chunker.chunk(contentPart, chunkingOptions)
            logger.debug(f"Created {len(chunks)} chunks for {contentPart.typeGroup} part")
            return chunks
        except Exception as e:
            logger.error(f"Chunking failed for {contentPart.typeGroup}: {str(e)}")
            return []

    def _mergePartResults(self, partResults: List[AiCallResponse]) -> str:
        """Merge part results using the existing sophisticated merging system."""
        if not partResults:
            return ""

        # Convert AiCallResponse results to ContentParts for merging
        from modules.datamodels.datamodelExtraction import ContentPart
        from modules.services.serviceExtraction.subUtils import makeId

        content_parts = []
        for i, result in enumerate(partResults):
            if result.content:
                content_part = ContentPart(
                    id=makeId(),
                    parentId=None,
                    label=f"ai_result_{i}",
                    typeGroup="text",  # Default to text for AI results
                    mimeType="text/plain",
                    data=result.content,
                    metadata={
                        "aiResult": True,
                        "modelName": result.modelName,
                        "priceUsd": result.priceUsd,
                        "processingTime": result.processingTime,
                        "bytesSent": result.bytesSent,
                        "bytesReceived": result.bytesReceived
                    }
                )
                content_parts.append(content_part)

        # Use existing merging system
        merge_strategy = {
            "useIntelligentMerging": True,
            "groupBy": "typeGroup",
            "orderBy": "id",
            "mergeType": "concatenate"
        }

        from modules.services.serviceExtraction.subPipeline import _applyMerging
        merged_parts = _applyMerging(content_parts, merge_strategy)

        # Convert merged parts back to final string
        final_content = "\n\n".join([part.data for part in merged_parts])

        logger.info(f"Merged {len(partResults)} AI results using existing merging system")
        return final_content.strip()

    def _mergeChunkResults(self, chunkResults: List[AiCallResponse]) -> str:
        """Merge chunk results using the existing sophisticated merging system."""
        if not chunkResults:
            return ""

        # Convert AiCallResponse results to ContentParts for merging
        from modules.datamodels.datamodelExtraction import ContentPart
        from modules.services.serviceExtraction.subUtils import makeId

        content_parts = []
        for i, result in enumerate(chunkResults):
            if result.content:
                content_part = ContentPart(
                    id=makeId(),
                    parentId=None,
                    label=f"chunk_result_{i}",
                    typeGroup="text",  # Default to text for AI results
                    mimeType="text/plain",
                    data=result.content,
                    metadata={
                        "aiResult": True,
                        "chunk": True,
                        "modelName": result.modelName,
                        "priceUsd": result.priceUsd,
                        "processingTime": result.processingTime,
                        "bytesSent": result.bytesSent,
                        "bytesReceived": result.bytesReceived
                    }
                )
                content_parts.append(content_part)

        # Use existing merging system
        merge_strategy = {
            "useIntelligentMerging": True,
            "groupBy": "typeGroup",
            "orderBy": "id",
            "mergeType": "concatenate"
        }

        from modules.services.serviceExtraction.subPipeline import _applyMerging
        merged_parts = _applyMerging(content_parts, merge_strategy)

        # Convert merged parts back to final string
        final_content = "\n\n".join([part.data for part in merged_parts])

        logger.info(f"Merged {len(chunkResults)} chunk results using existing merging system")
        return final_content.strip()

    def _createErrorResponse(self, errorMsg: str, inputBytes: int, outputBytes: int) -> AiCallResponse:
        """Create an error response."""
        return AiCallResponse(
            content=errorMsg,
            modelName="error",
            priceUsd=0.0,
            processingTime=0.0,
            bytesSent=inputBytes,
            bytesReceived=outputBytes,
            errorCount=1
        )

    async def _callWithModel(self, model: AiModel, prompt: str, context: str, temperature: float, maxTokens: Optional[int], inputBytes: int) -> AiCallResponse:
        """Call a specific model and return the response."""
        # Replace <TOKEN_LIMIT> placeholder in prompt for this specific model
        contextLength = model.contextLength
        if contextLength > 0:
            tokenLimit = str(contextLength)
        else:
            tokenLimit = "16000"  # Default for text generation

        # Create a copy of the prompt for this model call
        modelPrompt = prompt
        if "<TOKEN_LIMIT>" in modelPrompt:
            modelPrompt = modelPrompt.replace("<TOKEN_LIMIT>", tokenLimit)
            logger.debug(f"Replaced <TOKEN_LIMIT> with {tokenLimit} for model {model.name}")

        # Build messages array with replaced content
        messages = []
        if context:
            messages.append({"role": "system", "content": f"Context from documents:\n{context}"})
        messages.append({"role": "user", "content": modelPrompt})

        # Start timing
        startTime = time.time()

        # Get the connector for this model
        connector = modelRegistry.getConnectorForModel(model.name)
        if not connector:
            raise ValueError(f"No connector found for model {model.name}")

        # Call the model's function directly
        if model.functionCall:
            # Use the model's function call directly
            if model.name.startswith("perplexity_callAiWithWebSearch"):
                query = modelPrompt
                if context:
                    query = f"Context: {context}\n\nQuery: {modelPrompt}"
                content = await model.functionCall(query, temperature=temperature, maxTokens=maxTokens)
            elif model.name.startswith("perplexity_researchTopic"):
                content = await model.functionCall(modelPrompt)
            elif model.name.startswith("perplexity_answerQuestion"):
                content = await model.functionCall(modelPrompt, context)
            elif model.name.startswith("perplexity_getCurrentNews"):
                content = await model.functionCall(modelPrompt)
            else:
                # Standard callAiBasic
                if model.connectorType == "anthropic":
                    response = await model.functionCall(messages, temperature=temperature, maxTokens=maxTokens)
                    content = response["choices"][0]["message"]["content"]
                else:
                    content = await model.functionCall(messages, temperature=temperature, maxTokens=maxTokens)
        else:
            raise ValueError(f"Model {model.name} has no function call defined")

        # Calculate timing and output bytes
        endTime = time.time()
        processingTime = endTime - startTime
        outputBytes = len(content.encode("utf-8"))

        # Calculate price using model's own price calculation method
        priceUsd = model.calculatePriceUsd(inputBytes, outputBytes)

        return AiCallResponse(
            content=content,
            modelName=model.name,
            priceUsd=priceUsd,
            processingTime=processingTime,
            bytesSent=inputBytes,
            bytesReceived=outputBytes,
            errorCount=0
        )

    async def callImage(self, prompt: str, imageData: Union[str, bytes], mimeType: Optional[str] = None, options: Optional[AiCallOptions] = None) -> AiCallResponse:
        """Call AI model for image analysis with fallback mechanism."""

        if options is None:
            options = AiCallOptions(operationType=OperationTypeEnum.IMAGE_ANALYSE)

        # Calculate input bytes (prompt + image data)
        if isinstance(imageData, bytes):
            inputBytes = len(prompt.encode("utf-8")) + len(imageData)
        else:
            inputBytes = len(prompt.encode("utf-8")) + len(str(imageData).encode("utf-8"))

        # Get fallback models for image analysis
        availableModels = modelRegistry.getAvailableModels()
        failoverModelList = modelSelector.getFailoverModelList(prompt, "", options, availableModels)

        if not failoverModelList:
            errorMsg = "No suitable models found for image analysis"
            logger.error(errorMsg)
            return AiCallResponse(
                content=errorMsg,
                modelName="error",
                priceUsd=0.0,
                processingTime=0.0,
                bytesSent=inputBytes,
                bytesReceived=0,
                errorCount=1
            )

        # Try each model in fallback sequence
        lastError = None
        for attempt, model in enumerate(failoverModelList):
            try:
                logger.info(f"Attempting image analysis with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})")

                # Call the model
                response = await self._callImageWithModel(model, prompt, imageData, mimeType, inputBytes)

                logger.info(f"✅ Image analysis successful with model: {model.name}")
                return response

            except Exception as e:
                lastError = e
                logger.warning(f"❌ Image analysis failed with model {model.name}: {str(e)}")

                # If this is not the last model, try the next one
                if attempt < len(failoverModelList) - 1:
                    logger.info("🔄 Trying next fallback model for image analysis...")
                    continue
                else:
                    # All models failed
                    logger.error(f"💥 All {len(failoverModelList)} models failed for image analysis")
                    break

        # All fallback attempts failed - return error response
        errorMsg = f"All AI models failed for image analysis. Last error: {str(lastError)}"
        logger.error(errorMsg)
        return AiCallResponse(
            content=errorMsg,
            modelName="error",
            priceUsd=0.0,
            processingTime=0.0,
            bytesSent=inputBytes,
            bytesReceived=0,
            errorCount=1
        )

    async def _callImageWithModel(self, model: AiModel, prompt: str, imageData: Union[str, bytes], mimeType: str, inputBytes: int) -> AiCallResponse:
        """Call a specific model for image analysis and return the response."""
        # Start timing
        startTime = time.time()

        # Call the model's function directly
        if model.functionCall:
            content = await model.functionCall(prompt, imageData, mimeType)
        else:
            raise ValueError(f"Model {model.name} has no function call defined")

        # Calculate timing and output bytes
        endTime = time.time()
        processingTime = endTime - startTime
        outputBytes = len(content.encode("utf-8"))

        # Calculate price using model's own price calculation method
        priceUsd = model.calculatePriceUsd(inputBytes, outputBytes)

        return AiCallResponse(
            content=content,
            modelName=model.name,
            priceUsd=priceUsd,
            processingTime=processingTime,
            bytesSent=inputBytes,
            bytesReceived=outputBytes,
            errorCount=0
        )

    async def generateImage(self, prompt: str, size: str = "1024x1024", quality: str = "standard", style: str = "vivid", options: Optional[AiCallOptions] = None) -> AiCallResponse:
        """Generate an image using AI."""

        if options is None:
            options = AiCallOptions(operationType=OperationTypeEnum.IMAGE_GENERATE)

        # Calculate input bytes
        inputBytes = len(prompt.encode("utf-8"))

        # Keep modelName bound even if model selection itself fails
        modelName = "unknown"
        try:
            # Select the best model for image generation
            modelName = self._selectModel(prompt, "", options)
            selectedModel = modelRegistry.getModel(modelName)

            if not selectedModel:
                raise ValueError(f"Selected model {modelName} not found in registry")

            # Get the connector for this model
            connector = modelRegistry.getConnectorForModel(modelName)
            if not connector:
                raise ValueError(f"No connector found for model {modelName}")

            # Start timing
            startTime = time.time()

            # Call the model's function directly
            if selectedModel.functionCall:
                result = await selectedModel.functionCall(prompt, size, quality, style)
                content = str(result)
            else:
                raise ValueError(f"Model {modelName} has no function call defined")

            # Calculate timing and output bytes
            endTime = time.time()
            processingTime = endTime - startTime
            outputBytes = len(content.encode("utf-8"))

            # Calculate price using model's own price calculation method
            priceUsd = selectedModel.calculatePriceUsd(inputBytes, outputBytes)

            logger.info(f"✅ Image generation successful with model: {modelName}")
            return AiCallResponse(
                content=content,
                modelName=modelName,
                priceUsd=priceUsd,
                processingTime=processingTime,
                bytesSent=inputBytes,
                bytesReceived=outputBytes,
                errorCount=0
            )

        except Exception as e:
            logger.error(f"❌ Image generation failed with model {modelName}: {str(e)}")
            return AiCallResponse(
                content=f"Image generation failed: {str(e)}",
                modelName=modelName,
                priceUsd=0.0,
                processingTime=0.0,
                bytesSent=inputBytes,
                bytesReceived=0,
                errorCount=1
            )

    # Web functionality methods - Simple interface to Tavily connector
    async def searchWebsites(self, query: str, maxResults: int = 5, **kwargs) -> List[WebSearchResultItem]:
        """Search for websites using Tavily."""
        request = WebSearchRequest(
            query=query,
            max_results=maxResults,
            **kwargs
        )
        # Get Tavily connector from registry
        tavilyConnector = modelRegistry.getConnectorForModel("tavily_search")
        if not tavilyConnector:
            raise ValueError("Tavily connector not available")
        result = await tavilyConnector.search(request)

        if result.success and result.documents:
            return result.documents[0].documentData.results
        return []

    async def crawlWebsites(self, urls: List[str], extractDepth: str = "advanced", format: str = "markdown") -> List[WebCrawlResultItem]:
        """Crawl websites using Tavily."""
        from pydantic import HttpUrl
        from urllib.parse import urlparse

        # Safely create HttpUrl objects with proper scheme handling
        httpUrls = []
        for url in urls:
            try:
                # Ensure URL has a scheme
                parsed = urlparse(url)
                if not parsed.scheme:
                    url = f"https://{url}"

                # Use HttpUrl with scheme parameter (this works for all URLs)
                httpUrls.append(HttpUrl(url, scheme="https"))

            except Exception as e:
                logger.warning(f"Skipping invalid URL {url}: {e}")
                continue

        if not httpUrls:
            return []

        request = WebCrawlRequest(
            urls=httpUrls,
            extract_depth=extractDepth,
            format=format
        )
        # Get Tavily connector from registry
        tavilyConnector = modelRegistry.getConnectorForModel("tavily_crawl")
        if not tavilyConnector:
            raise ValueError("Tavily connector not available")
        result = await tavilyConnector.crawl(request)

        if result.success and result.documents:
            return result.documents[0].documentData.results
        return []

    async def extractContent(self, urls: List[str], extractDepth: str = "advanced", format: str = "markdown") -> Dict[str, str]:
        """Extract content from URLs and return as dictionary."""
        crawlResults = await self.crawlWebsites(urls, extractDepth, format)
        return {str(result.url): result.content for result in crawlResults}

    # Core Web Tools - Clean interface for web operations
    async def readPage(self, url: str, extractDepth: str = "advanced") -> Optional[str]:
        """Read a single web page and return its content (HTML/Markdown)."""
        logger.debug(f"Reading page: {url}")
        try:
            # URL encode the URL to handle spaces and special characters
            from urllib.parse import quote, urlparse, urlunparse
            parsed = urlparse(url)
            encodedUrl = urlunparse((
                parsed.scheme,
                parsed.netloc,
                parsed.path,
                parsed.params,
                parsed.query,
                parsed.fragment
            ))

            # Manually encode query parameters to handle spaces
            if parsed.query:
                encodedQuery = quote(parsed.query, safe='=&')
                encodedUrl = urlunparse((
                    parsed.scheme,
                    parsed.netloc,
                    parsed.path,
                    parsed.params,
                    encodedQuery,
                    parsed.fragment
                ))

            logger.debug(f"URL encoded: {url} -> {encodedUrl}")

            content = await self.extractContent([encodedUrl], extractDepth, "markdown")
            result = content.get(encodedUrl)
            if result:
                logger.debug(f"Successfully read page {encodedUrl}: {len(result)} chars")
            else:
                logger.warning(f"No content returned for page {encodedUrl}")
            return result
        except Exception as e:
            logger.warning(f"Failed to read page {url}: {e}")
            return None

    async def getUrlsFromPage(self, url: str, extractDepth: str = "advanced") -> List[str]:
        """Get all URLs from a web page, with redundancies removed."""
        try:
            content = await self.readPage(url, extractDepth)
            if not content:
                return []

            links = self._extractLinksFromContent(content, url)
            # Remove duplicates while preserving order
            seen = set()
            uniqueLinks = []
            for link in links:
                if link not in seen:
                    seen.add(link)
                    uniqueLinks.append(link)

            logger.debug(f"Extracted {len(uniqueLinks)} unique URLs from {url}")
            return uniqueLinks

        except Exception as e:
            logger.warning(f"Failed to get URLs from page {url}: {e}")
            return []

    def filterUrlsOnlyPages(self, urls: List[str], maxPerDomain: int = 10) -> List[str]:
        """Filter URLs to get only links for pages to follow (no images, etc.)."""
        from urllib.parse import urlparse

        def _isHtmlCandidate(url: str) -> bool:
            lower = url.lower()
            blocked = ('.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp',
                       '.mp4', '.mp3', '.avi', '.mov', '.mkv',
                       '.pdf', '.zip', '.rar', '.7z', '.tar', '.gz',
                       '.css', '.js', '.woff', '.woff2', '.ttf', '.eot')
            return not lower.endswith(blocked)

        # Group by domain
        domainLinks = {}
        for link in urls:
            domain = urlparse(link).netloc
            if domain not in domainLinks:
                domainLinks[domain] = []
            domainLinks[domain].append(link)

        # Filter and cap per domain
        filteredLinks = []
        for domain, domainLinkList in domainLinks.items():
            seen = set()
            domainFiltered = []

            for link in domainLinkList:
                if link in seen:
                    continue
                if not _isHtmlCandidate(link):
                    continue
                seen.add(link)
                domainFiltered.append(link)
                if len(domainFiltered) >= maxPerDomain:
                    break

            filteredLinks.extend(domainFiltered)
            logger.debug(f"Domain {domain}: {len(domainLinkList)} -> {len(domainFiltered)} links")

        return filteredLinks

    def _extractLinksFromContent(self, content: str, baseUrl: str) -> List[str]:
        """Extract links from HTML/Markdown content."""
        try:
            import re
            from urllib.parse import urljoin, urlparse, quote, urlunparse

            def _cleanUrl(url: str) -> str:
                """Clean and encode URL to remove spaces and invalid characters."""
                # Remove quotes and extra spaces
                url = url.strip().strip('"\'')

                # If it's a relative URL, make it absolute first
                if not url.startswith(('http://', 'https://')):
                    url = urljoin(baseUrl, url)

                # Parse and re-encode the URL properly
                parsed = urlparse(url)
                if parsed.query:
                    # Encode query parameters properly
                    encodedQuery = quote(parsed.query, safe='=&')
                    url = urlunparse((
                        parsed.scheme,
                        parsed.netloc,
                        parsed.path,
                        parsed.params,
                        encodedQuery,
                        parsed.fragment
                    ))

                return url

            links = []

            # Extract HTML links: <a href="url"> format
            htmlLinkPattern = r'<a[^>]+href=["\']([^"\']+)["\'][^>]*>'
            htmlLinks = re.findall(htmlLinkPattern, content, re.IGNORECASE)

            for url in htmlLinks:
                if url and not url.startswith('#') and not url.startswith('javascript:'):
                    try:
                        cleanedUrl = _cleanUrl(url)
                        links.append(cleanedUrl)
                        logger.debug(f"Extracted HTML link: {url} -> {cleanedUrl}")
                    except Exception as e:
                        logger.debug(f"Failed to clean HTML link {url}: {e}")

            # Extract markdown links: [text](url) format
            markdownLinkPattern = r'\[([^\]]+)\]\(([^)]+)\)'
            markdownLinks = re.findall(markdownLinkPattern, content)

            for text, url in markdownLinks:
                if url and not url.startswith('#'):
                    try:
                        cleanedUrl = _cleanUrl(url)
                        # Only keep URLs from the same domain
                        if urlparse(cleanedUrl).netloc == urlparse(baseUrl).netloc:
                            links.append(cleanedUrl)
                            logger.debug(f"Extracted markdown link: {url} -> {cleanedUrl}")
                    except Exception as e:
                        logger.debug(f"Failed to clean markdown link {url}: {e}")

            # Extract plain URLs in the text
            urlPattern = r'https?://[^\s\)]+'
            plainUrls = re.findall(urlPattern, content)

            for url in plainUrls:
                try:
                    cleanUrl = url.rstrip('.,;!?')
                    cleanedUrl = _cleanUrl(cleanUrl)
                    if urlparse(cleanedUrl).netloc == urlparse(baseUrl).netloc:
                        if cleanedUrl not in links:  # Avoid duplicates
                            links.append(cleanedUrl)
                            logger.debug(f"Extracted plain URL: {url} -> {cleanedUrl}")
                except Exception as e:
                    logger.debug(f"Failed to clean plain URL {url}: {e}")

            logger.debug(f"Total links extracted and cleaned: {len(links)}")
            return links

        except Exception as e:
            logger.warning(f"Failed to extract links from content: {e}")
            return []

    def _normalizeUrl(self, url: str) -> str:
        """Normalize URL to handle variations that should be considered duplicates."""
        if not url:
            return url

        # Remove trailing slashes and fragments
        url = url.rstrip('/')
        if '#' in url:
            url = url.split('#')[0]

        # Handle common URL variations
        url = url.replace('http://', 'https://')  # Normalize protocol

        return url

    async def crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10, global_processed_urls: Optional[set] = None) -> Dict[str, str]:
        """
        Recursively crawl URLs up to specified depth.

        Args:
            urls: List of starting URLs to crawl
            max_depth: Maximum depth to crawl (1=main pages only, 2=main+sub-pages, etc.)
            extract_depth: Tavily extract depth setting
            max_per_domain: Maximum URLs per domain per level
            global_processed_urls: Optional global set to track processed URLs across sessions

        Returns:
            Dictionary mapping URL -> content for all crawled pages
        """
        logger.info(f"Starting recursive crawl: {len(urls)} starting URLs, max_depth={max_depth}")

        # URL index to track all processed URLs (local + global)
        processed_urls = set()
        if global_processed_urls is not None:
            # Use global index if provided, otherwise create local one
            processed_urls = global_processed_urls
            logger.info(f"Using global URL index with {len(processed_urls)} already processed URLs")
        else:
            logger.info("Using local URL index for this crawl session")

        all_content = {}

        # Current level URLs to process
        current_level_urls = urls.copy()

        try:
            for depth in range(1, max_depth + 1):
                logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===")
                logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}")

                # URLs found at this level (for next iteration)
                next_level_urls = []

                for url in current_level_urls:
                    # Normalize URL for duplicate checking
                    normalized_url = self._normalizeUrl(url)
                    if normalized_url in processed_urls:
                        logger.debug(f"URL {url} (normalized: {normalized_url}) already processed, skipping")
                        continue

                    try:
                        logger.info(f"Processing URL at depth {depth}: {url}")
                        logger.debug(f"Total processed URLs so far: {len(processed_urls)}")

                        # Read page content
                        content = await self.readPage(url, extract_depth)
                        if content:
                            all_content[url] = content
                            processed_urls.add(normalized_url)
                            logger.info(f"✓ Successfully processed {url}: {len(content)} chars")

                            # Get URLs from this page for next level
                            page_urls = await self.getUrlsFromPage(url, extract_depth)
                            logger.info(f"Found {len(page_urls)} URLs on {url}")

                            # Filter URLs and add to next level
                            filtered_urls = self.filterUrlsOnlyPages(page_urls, max_per_domain)
                            logger.info(f"Filtered to {len(filtered_urls)} valid URLs")

                            # Add new URLs to next level (avoiding already processed ones)
                            new_urls_count = 0
                            for new_url in filtered_urls:
                                normalized_new_url = self._normalizeUrl(new_url)
                                if normalized_new_url not in processed_urls:
                                    next_level_urls.append(new_url)
                                    new_urls_count += 1
                                else:
                                    logger.debug(f"URL {new_url} (normalized: {normalized_new_url}) already processed, skipping")

                            logger.info(f"Added {new_urls_count} new URLs to next level from {url}")
                        else:
                            logger.warning(f"✗ No content extracted from {url}")
                            processed_urls.add(normalized_url)  # Mark as processed to avoid retry

                    except Exception as e:
                        logger.warning(f"✗ Failed to process URL {url} at depth {depth}: {e}")
                        processed_urls.add(normalized_url)  # Mark as processed to avoid retry

                # Prepare for next iteration
                current_level_urls = next_level_urls
                logger.info(f"Depth {depth} completed. Found {len(next_level_urls)} URLs for next level")

                # Stop if no more URLs to process
                if not current_level_urls:
                    logger.info(f"No more URLs found at depth {depth}, stopping recursion")
                    break

            logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
            logger.info(f"Total URLs processed (including skipped): {len(processed_urls)}")
            logger.info(f"Unique URLs found: {len(all_content)}")
            return all_content

        except asyncio.TimeoutError:
            logger.warning(f"Crawling timed out, returning partial results: {len(all_content)} pages crawled so far")
            return all_content
        except Exception as e:
            logger.error(f"Crawling failed with error: {e}, returning partial results: {len(all_content)} pages crawled so far")
            return all_content

    async def webQuery(self, query: str, context: str = "", options: Optional[AiCallOptions] = None) -> AiCallResponse:
        """Use Perplexity AI to provide the best answers for web-related queries."""

        if options is None:
            options = AiCallOptions(operationType=OperationTypeEnum.WEB_RESEARCH)

        # Calculate input bytes
        inputBytes = len((query + context).encode("utf-8"))

        # Create a comprehensive prompt for web queries
        webPrompt = f"""You are an expert web researcher and information analyst. Please provide a comprehensive and accurate answer to the following web-related query.

Query: {query}

{f"Additional Context: {context}" if context else ""}

Please provide:
1. A clear, well-structured answer to the query
2. Key points and important details
3. Relevant insights and analysis
4. Any important considerations or caveats
5. Suggestions for further research if applicable

Format your response in a clear, professional manner that would be helpful for someone researching this topic."""

        try:
            # Start timing
            startTime = time.time()

            # Use Perplexity for web research with search capabilities
            perplexity_connector = modelRegistry.getConnectorForModel("perplexity_callAiWithWebSearch")
            if not perplexity_connector:
                raise ValueError("Perplexity connector not available")
            response = await perplexity_connector.callAiWithWebSearch(webPrompt)

            # Calculate timing and output bytes
            endTime = time.time()
            processingTime = endTime - startTime
            outputBytes = len(response.encode("utf-8"))

            # Calculate price using Perplexity model pricing
            perplexity_model = modelRegistry.getModel("perplexity_callAiWithWebSearch")
            priceUsd = perplexity_model.calculatePriceUsd(inputBytes, outputBytes)

            logger.info("✅ Web query successful with Perplexity")
            return AiCallResponse(
                content=response,
                modelName="perplexity_callAiWithWebSearch",
                priceUsd=priceUsd,
                processingTime=processingTime,
                bytesSent=inputBytes,
                bytesReceived=outputBytes,
                errorCount=0
            )
        except Exception as e:
            logger.error(f"Perplexity web query failed: {str(e)}")
            return AiCallResponse(
                content=f"Web query failed: {str(e)}",
                modelName="perplexity_callAiWithWebSearch",
                priceUsd=0.0,
                processingTime=0.0,
                bytesSent=inputBytes,
                bytesReceived=0,
                errorCount=1
            )

    # Utility methods
    async def listAvailableModels(self, connectorType: Optional[str] = None) -> List[Dict[str, Any]]:
        """List available models, optionally filtered by connector type."""
        models = modelRegistry.getAvailableModels()
        if connectorType:
            return [model.dict() for model in models if model.connectorType == connectorType]
        return [model.dict() for model in models]

    async def getModelInfo(self, modelName: str) -> Dict[str, Any]:
        """Get information about a specific model."""
        model = modelRegistry.getModel(modelName)
        if not model:
            raise ValueError(f"Model {modelName} not found")
        return model.dict()

    async def getModelsByCapability(self, capability: str) -> List[str]:
        """Get model names that support a specific capability."""
        models = modelRegistry.getModelsByCapability(capability)
        return [model.name for model in models]

    async def getModelsByTag(self, tag: str) -> List[str]:
        """Get model names that have a specific tag."""
        models = modelRegistry.getModelsByTag(tag)
        return [model.name for model in models]

    async def selectRelevantWebsites(self, websites: List[str], userQuestion: str) -> Tuple[List[str], str]:
        """Select most relevant websites using AI analysis. Returns (selected_websites, ai_response)."""
        if len(websites) <= 1:
            return websites, "Only one website available, no selection needed"

        try:
            # Create website summaries for AI analysis
            from urllib.parse import urlparse
            websiteSummaries = []
            for i, url in enumerate(websites, 1):
                domain = urlparse(url).netloc
                summary = f"{i}. {url} (Domain: {domain})"
                websiteSummaries.append(summary)

            selectionPrompt = f"""
Based on this user request: "{userQuestion}"

I found {len(websites)} websites. Please select the most relevant website(s) for this request.

Available websites:
{chr(10).join(websiteSummaries)}

Please respond with the website number(s) (1, 2, 3, etc.) that are most relevant.
Format: 1,3,5 (or just 1 for single selection)
"""

            # Use Perplexity to select the best websites
            response = await self.webQuery(selectionPrompt)
            responseText = response.content  # webQuery returns an AiCallResponse, not a string

            # Parse the selection
            import re
            numbers = re.findall(r'\d+', responseText)
            if numbers:
                selectedWebsites = []
                for num in numbers:
                    index = int(num) - 1
                    if 0 <= index < len(websites):
                        selectedWebsites.append(websites[index])

                if selectedWebsites:
                    logger.info(f"AI selected {len(selectedWebsites)} websites")
                    return selectedWebsites, responseText

            # Fallback to first website
            logger.warning("AI selection failed, using first website")
            return websites[:1], f"AI selection failed, fallback to first website. AI response: {responseText}"

        except Exception as e:
            logger.error(f"Error in website selection: {str(e)}")
            return websites[:1], f"Error in website selection: {str(e)}"

    async def analyzeContentWithChunking(self, allContent: Dict[str, str], userQuestion: str) -> str:
        """Analyze content using AI with chunking for large content."""
        logger.info(f"Analyzing {len(allContent)} websites with AI")

        # Process content in chunks to avoid token limits
        chunkSize = 50000  # 50k chars per chunk
        allChunks = []

        for url, content in allContent.items():
            filteredContent = self._filterContent(content)
            if len(filteredContent) <= chunkSize:
                allChunks.append((url, filteredContent))
                logger.info(f"Content from {url}: {len(filteredContent)} chars (single chunk)")
            else:
                # Split large content into chunks
                chunkCount = (len(filteredContent) + chunkSize - 1) // chunkSize
                logger.info(f"Content from {url}: {len(filteredContent)} chars (split into {chunkCount} chunks)")
                for i in range(0, len(filteredContent), chunkSize):
                    chunk = filteredContent[i:i + chunkSize]
                    chunkNum = i // chunkSize + 1
                    allChunks.append((f"{url} (part {chunkNum})", chunk))

        logger.info(f"Processing {len(allChunks)} content chunks")

        # Analyze each chunk
        chunkAnalyses = []
        for i, (url, chunk) in enumerate(allChunks, 1):
            logger.info(f"Analyzing chunk {i}/{len(allChunks)}: {url}")

            try:
                analysisPrompt = f"""
Analyze this web content and extract relevant information for: {userQuestion}

Source: {url}
Content: {chunk}

Please extract key information relevant to the query.
"""

                analysis = await self.webQuery(analysisPrompt)
                chunkAnalyses.append(analysis.content)  # keep only the response text, not the AiCallResponse object
                logger.info(f"Chunk {i}/{len(allChunks)} analyzed successfully")

            except Exception as e:
                logger.error(f"Chunk {i}/{len(allChunks)} error: {e}")

        # Combine all chunk analyses
        if chunkAnalyses:
            logger.info(f"Combining {len(chunkAnalyses)} chunk analyses")
            combinedAnalysis = "\n\n".join(chunkAnalyses)

            # Final synthesis
            try:
                logger.info("Performing final synthesis of all analyses")
                synthesisPrompt = f"""
Based on these partial analyses, provide a comprehensive answer to: {userQuestion}

Partial analyses:
{combinedAnalysis}

Please provide a clear, well-structured answer to the query.
"""

                finalAnalysis = await self.webQuery(synthesisPrompt)
                logger.info("Final synthesis completed successfully")
                return finalAnalysis.content

            except Exception as e:
                logger.error(f"Synthesis error: {e}")
                return combinedAnalysis
        else:
            logger.error("No content could be analyzed")
            return "No content could be analyzed"

    def _filterContent(self, content: str) -> str:
        """Filter out navigation, ads, and other nonsense content."""
        lines = content.split('\n')
        filteredLines = []

        for line in lines:
            line = line.strip()
            # Skip empty lines
            if not line:
                continue
            # Skip navigation elements
            if any(skip in line.lower() for skip in [
                'toggle navigation', 'log in', 'sign up', 'cookies', 'privacy policy',
                'terms of service', 'subscribe', 'newsletter', 'follow us', 'share this',
                'advertisement', 'sponsored', 'banner', 'popup', 'modal'
            ]):
                continue
            # Skip image references without context (markdown images like "![](...)")
            if line.startswith('![](') and line.endswith(')') and '---' in line:
                continue
            # Keep meaningful content
            if len(line) > 10:  # Skip very short lines
                filteredLines.append(line)

        return '\n'.join(filteredLines)
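

# Minimal usage sketch (illustrative only): shows how this module's failover text
# call is typically driven end to end. Assumptions not visible in this file:
# that modelRegistry discovers at least one connector at startup, that
# AiCallRequest/AiCallOptions accept the keyword arguments used below, and that
# OperationTypeEnum defines a text-generation member (TEXT_GENERATE is a guess;
# substitute whichever member the actual enum provides).
async def _exampleUsage() -> None:
    # Create the facade; connectors are auto-discovered in __post_init__
    ai = await AiObjects.create()

    request = AiCallRequest(
        prompt="Summarize the key points of the attached context.",
        context="Example context text...",
        options=AiCallOptions(operationType=OperationTypeEnum.TEXT_GENERATE),
    )

    # call() picks a failover list of models and returns the first successful response
    response = await ai.call(request)
    logger.info(
        f"Model {response.modelName} answered in {response.processingTime:.2f}s: "
        f"{response.content[:200]}"
    )


if __name__ == "__main__":
    asyncio.run(_exampleUsage())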