web service parallelized
parent 0d77263fb7
commit cb7ed7cf51
3 changed files with 285 additions and 163 deletions
@@ -122,10 +122,14 @@ class AiCallLooper:
             )

-            # Write the ACTUAL prompt sent to AI
-            if iteration == 1:
-                self.services.utils.writeDebugFile(iterationPrompt, f"{debugPrefix}_prompt")
-            else:
-                self.services.utils.writeDebugFile(iterationPrompt, f"{debugPrefix}_prompt_iteration_{iteration}")
+            # For section content generation: only write one prompt file (first iteration)
+            # For document generation: write prompt for each iteration
+            isSectionContent = "_section_" in debugPrefix
+            if iteration == 1 or not isSectionContent:
+                if iteration == 1:
+                    self.services.utils.writeDebugFile(iterationPrompt, f"{debugPrefix}_prompt")
+                elif not isSectionContent:
+                    self.services.utils.writeDebugFile(iterationPrompt, f"{debugPrefix}_prompt_iteration_{iteration}")

             response = await self.aiService.callAi(request)
             result = response.content
@@ -146,10 +150,13 @@ class AiCallLooper:
             self.services.chat.progressLogUpdate(iterationOperationId, 0.6, f"AI response received ({bytesDisplay})")

-            # Write raw AI response to debug file
-            if iteration == 1:
-                self.services.utils.writeDebugFile(result, f"{debugPrefix}_response")
-            else:
-                self.services.utils.writeDebugFile(result, f"{debugPrefix}_response_iteration_{iteration}")
+            # For section content generation: only write one response file (first iteration)
+            # For document generation: write response for each iteration
+            if iteration == 1 or not isSectionContent:
+                if iteration == 1:
+                    self.services.utils.writeDebugFile(result, f"{debugPrefix}_response")
+                elif not isSectionContent:
+                    self.services.utils.writeDebugFile(result, f"{debugPrefix}_response_iteration_{iteration}")

             # Emit stats for this iteration (only if workflow exists and has id)
             if self.services.workflow and hasattr(self.services.workflow, 'id') and self.services.workflow.id:
@@ -219,9 +226,9 @@ class AiCallLooper:
                 logger.info(f"Iteration {iteration}: Section content generation detected (elements found), returning JSON directly")
                 if iterationOperationId:
                     self.services.chat.progressLogFinish(iterationOperationId, True)
-                # Write final result
+                # Note: Debug files (_prompt and _response) are already written above for iteration 1
+                # No need to write _final_result as it's redundant with _response
                 final_json = json.dumps(parsedJsonForSection, indent=2, ensure_ascii=False) if parsedJsonForSection else (extractedJsonForSection or result)
-                self.services.utils.writeDebugFile(final_json, f"{debugPrefix}_final_result")
                 return final_json

             # Extract sections from response (handles both valid and broken JSON)
@@ -397,7 +404,10 @@ class AiCallLooper:
                 self.services.chat.progressLogUpdate(operationId, estimatedProgress, f"Pipeline: {bytesDisplay} (iteration {iteration})")

-            # Log merged sections for debugging
-            self.services.utils.writeDebugFile(merged_json_str, f"{debugPrefix}_merged_sections_iteration_{iteration}")
+            # For section content generation: skip merged sections debug files (only one prompt/response needed)
+            isSectionContent = "_section_" in debugPrefix
+            if not isSectionContent:
+                self.services.utils.writeDebugFile(merged_json_str, f"{debugPrefix}_merged_sections_iteration_{iteration}")

             # Check if we should continue (completion detection)
             # Simple logic: JSON completeness determines continuation
@@ -465,7 +475,10 @@ class AiCallLooper:
         final_result = self.responseParser.buildFinalResultFromSections(allSections, documentMetadata)

-        # Write final result to debug file
-        self.services.utils.writeDebugFile(final_result, f"{debugPrefix}_final_result")
+        # For section content generation: skip final_result debug file (response already written)
+        isSectionContent = "_section_" in debugPrefix
+        if not isSectionContent:
+            self.services.utils.writeDebugFile(final_result, f"{debugPrefix}_final_result")

         return final_result
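The five AiCallLooper hunks above converge on one rule: debug prefixes containing `_section_` (section content generation) get exactly one `_prompt`/`_response` file pair, while document generation keeps per-iteration files. A minimal sketch of that rule as a standalone helper; the helper name and the `writeFile` callback signature are illustrative assumptions, not part of this commit:

```python
from typing import Callable

def writeIterationDebugFile(
    writeFile: Callable[[str, str], None],  # stand-in for services.utils.writeDebugFile (assumed signature)
    payload: str,
    debugPrefix: str,
    kind: str,
    iteration: int,
) -> None:
    """Write one debug file per run for section content, one per iteration otherwise."""
    isSectionContent = "_section_" in debugPrefix
    if iteration == 1:
        writeFile(payload, f"{debugPrefix}_{kind}")
    elif not isSectionContent:
        writeFile(payload, f"{debugPrefix}_{kind}_iteration_{iteration}")

# Example: a "chapter1_section_2" prefix only ever yields "..._prompt",
# while a document prefix also yields "..._prompt_iteration_2", "..._prompt_iteration_3", etc.
collected = []
writeIterationDebugFile(lambda text, name: collected.append(name), "prompt text", "doc_main", "prompt", 2)
# collected == ["doc_main_prompt_iteration_2"]
```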
@@ -537,11 +537,6 @@ class StructureFiller:

         try:
             self.services.chat.progressLogUpdate(sectionOperationId, 0.2, "Building generation prompt")
-            self.services.utils.writeDebugFile(
-                generationPrompt,
-                f"{chapterId}_section_{sectionId}_prompt"
-            )
-            logger.debug(f"Logged section prompt: {chapterId}_section_{sectionId}_prompt (aggregation)")

             self.services.chat.progressLogUpdate(sectionOperationId, 0.4, "Calling AI for content generation")
@@ -553,6 +548,12 @@
                     logger.warning(f"Truncating DALL-E prompt from {len(generationPrompt)} to {maxPromptLength} characters")
                     generationPrompt = generationPrompt[:maxPromptLength].rsplit('\n', 1)[0]

+                # Write debug file for IMAGE_GENERATE (direct callAi, no _callAiWithLooping)
+                self.services.utils.writeDebugFile(
+                    generationPrompt,
+                    f"{chapterId}_section_{sectionId}_prompt"
+                )
+
                 request = AiCallRequest(
                     prompt=generationPrompt,
                     contentParts=[],
@@ -564,6 +565,12 @@
                 )
                 aiResponse = await self.aiService.callAi(request)
                 generatedElements = []
+
+                # Write debug file for IMAGE_GENERATE response (direct callAi, no _callAiWithLooping)
+                self.services.utils.writeDebugFile(
+                    aiResponse.content if hasattr(aiResponse, 'content') else str(aiResponse),
+                    f"{chapterId}_section_{sectionId}_response"
+                )
             else:
                 async def buildSectionPromptWithContinuation(
                     section: Dict[str, Any],
@@ -665,11 +672,7 @@ The JSON should be a fragment that can be merged with the previous response."""
             generatedElements = []

             self.services.chat.progressLogUpdate(sectionOperationId, 0.6, "Processing AI response")
-            self.services.utils.writeDebugFile(
-                aiResponse.content if hasattr(aiResponse, 'content') else str(aiResponse),
-                f"{chapterId}_section_{sectionId}_response"
-            )
-            logger.debug(f"Logged section response: {chapterId}_section_{sectionId}_response (aggregation)")
+            # Note: Debug files are written by _callAiWithLooping using debugPrefix

             self.services.chat.progressLogUpdate(sectionOperationId, 0.8, "Validating generated content")
@@ -735,11 +738,6 @@ The JSON should be a fragment that can be merged with the previous response."""

         try:
             self.services.chat.progressLogUpdate(sectionOperationId, 0.2, "Building generation prompt")
-            self.services.utils.writeDebugFile(
-                generationPrompt,
-                f"{chapterId}_section_{sectionId}_prompt"
-            )
-            logger.debug(f"Logged section prompt: {chapterId}_section_{sectionId}_prompt")

             self.services.chat.progressLogUpdate(sectionOperationId, 0.4, "Calling AI for content generation")
@@ -751,6 +749,12 @@ The JSON should be a fragment that can be merged with the previous response."""
                     logger.warning(f"Truncating DALL-E prompt from {len(generationPrompt)} to {maxPromptLength} characters")
                     generationPrompt = generationPrompt[:maxPromptLength].rsplit('\n', 1)[0]

+                # Write debug file for IMAGE_GENERATE (direct callAi, no _callAiWithLooping)
+                self.services.utils.writeDebugFile(
+                    generationPrompt,
+                    f"{chapterId}_section_{sectionId}_prompt"
+                )
+
                 request = AiCallRequest(
                     prompt=generationPrompt,
                     contentParts=[],
@@ -762,6 +766,12 @@ The JSON should be a fragment that can be merged with the previous response."""
                 )
                 aiResponse = await self.aiService.callAi(request)
                 generatedElements = []
+
+                # Write debug file for IMAGE_GENERATE response (direct callAi, no _callAiWithLooping)
+                self.services.utils.writeDebugFile(
+                    aiResponse.content if hasattr(aiResponse, 'content') else str(aiResponse),
+                    f"{chapterId}_section_{sectionId}_response"
+                )
             else:
                 isAggregation = False
@@ -865,11 +875,7 @@ The JSON should be a fragment that can be merged with the previous response."""
             generatedElements = []

             self.services.chat.progressLogUpdate(sectionOperationId, 0.6, "Processing AI response")
-            self.services.utils.writeDebugFile(
-                aiResponse.content if hasattr(aiResponse, 'content') else str(aiResponse),
-                f"{chapterId}_section_{sectionId}_response"
-            )
-            logger.debug(f"Logged section response: {chapterId}_section_{sectionId}_response")
+            # Note: Debug files are written by _callAiWithLooping using debugPrefix

             self.services.chat.progressLogUpdate(sectionOperationId, 0.8, "Validating generated content")
@@ -968,11 +974,6 @@ The JSON should be a fragment that can be merged with the previous response."""

         try:
             self.services.chat.progressLogUpdate(sectionOperationId, 0.2, "Building generation prompt")
-            self.services.utils.writeDebugFile(
-                generationPrompt,
-                f"{chapterId}_section_{sectionId}_prompt"
-            )
-            logger.debug(f"Logged section prompt: {chapterId}_section_{sectionId}_prompt")

             self.services.chat.progressLogUpdate(sectionOperationId, 0.4, "Calling AI for content generation")
@@ -984,6 +985,12 @@ The JSON should be a fragment that can be merged with the previous response."""
                     logger.warning(f"Truncating DALL-E prompt from {len(generationPrompt)} to {maxPromptLength} characters")
                     generationPrompt = generationPrompt[:maxPromptLength].rsplit('\n', 1)[0]

+                # Write debug file for IMAGE_GENERATE (direct callAi, no _callAiWithLooping)
+                self.services.utils.writeDebugFile(
+                    generationPrompt,
+                    f"{chapterId}_section_{sectionId}_prompt"
+                )
+
                 request = AiCallRequest(
                     prompt=generationPrompt,
                     contentParts=[],
@@ -995,6 +1002,12 @@ The JSON should be a fragment that can be merged with the previous response."""
                 )
                 aiResponse = await self.aiService.callAi(request)
                 generatedElements = []
+
+                # Write debug file for IMAGE_GENERATE response (direct callAi, no _callAiWithLooping)
+                self.services.utils.writeDebugFile(
+                    aiResponse.content if hasattr(aiResponse, 'content') else str(aiResponse),
+                    f"{chapterId}_section_{sectionId}_response"
+                )
             else:
                 isAggregation = False
@@ -1098,11 +1111,7 @@ The JSON should be a fragment that can be merged with the previous response."""
             generatedElements = []

             self.services.chat.progressLogUpdate(sectionOperationId, 0.6, "Processing AI response")
-            self.services.utils.writeDebugFile(
-                aiResponse.content if hasattr(aiResponse, 'content') else str(aiResponse),
-                f"{chapterId}_section_{sectionId}_response"
-            )
-            logger.debug(f"Logged section response: {chapterId}_section_{sectionId}_response")
+            # Note: Debug files are written by _callAiWithLooping using debugPrefix

             self.services.chat.progressLogUpdate(sectionOperationId, 0.8, "Validating generated content")
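One detail kept as context in the IMAGE_GENERATE hunks above is the truncation of over-long DALL-E prompts at a newline boundary via `generationPrompt[:maxPromptLength].rsplit('\n', 1)[0]`. A self-contained sketch of that idea; the function name and the example limit are assumptions, not the project's API:

```python
def truncateAtLineBoundary(prompt: str, maxPromptLength: int = 4000) -> str:
    """Trim prompt to maxPromptLength and drop the trailing partial line, if any."""
    if len(prompt) <= maxPromptLength:
        return prompt
    truncated = prompt[:maxPromptLength]
    # rsplit('\n', 1)[0] keeps everything before the last newline in the slice;
    # if the slice contains no newline it is returned unchanged.
    return truncated.rsplit('\n', 1)[0]

# Example: a prompt of 50 lines x 100 characters comes back ending on a complete line,
# never in the middle of one.
```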
@@ -8,6 +8,8 @@ Manages the two-step process: WEB_SEARCH then WEB_CRAWL.
 import json
 import logging
 import time
+import asyncio
+from urllib.parse import urlparse
 from typing import Dict, Any, List, Optional
 from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, AiCallPromptWebSearch, AiCallPromptWebCrawl
@@ -99,12 +101,18 @@ class WebService:
             self.services.chat.progressLogUpdate(operationId, 0.5, f"Found {len(allUrls)} total URLs")

-        # Step 3: Filter to maxNumberPages (simple cut, no intelligent filtering)
-        if len(allUrls) > maxNumberPages:
-            allUrls = allUrls[:maxNumberPages]
+        # Step 3: Validate and filter URLs before crawling
+        validatedUrls = self._validateUrls(allUrls)
+        if not validatedUrls:
+            logger.warning(f"All {len(allUrls)} URLs failed validation")
+            return {"error": "No valid URLs found to crawl"}
+
+        # Filter to maxNumberPages (simple cut, no intelligent filtering)
+        if len(validatedUrls) > maxNumberPages:
+            validatedUrls = validatedUrls[:maxNumberPages]
             logger.info(f"Limited URLs to {maxNumberPages}")

-        if not allUrls:
+        if not validatedUrls:
             return {"error": "No URLs found to crawl"}

         # Step 4: Translate researchDepth to maxDepth
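Step 3 now routes every discovered URL through `_validateUrls` (the helper added further down in this file) before the page cut, so only absolute http/https URLs reach the crawler. A runnable sketch of the same filter outside the service; the function name here is illustrative, not the service method:

```python
from typing import List
from urllib.parse import urlparse

def filterCrawlableUrls(urls: List[str]) -> List[str]:
    """Keep only non-empty strings that parse to absolute http/https URLs."""
    valid: List[str] = []
    for url in urls:
        if not isinstance(url, str) or not url.strip():
            continue
        parsed = urlparse(url.strip())
        if parsed.scheme in ("http", "https") and parsed.netloc:
            valid.append(url.strip())
    return valid

print(filterCrawlableUrls(["https://example.com/a", "ftp://host/file", "", "not a url"]))
# -> ['https://example.com/a']
```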
@@ -114,14 +122,14 @@ class WebService:
         # Step 5: Crawl all URLs with hierarchical logging
         if operationId:
             self.services.chat.progressLogUpdate(operationId, 0.4, "Initiating")
-            self.services.chat.progressLogUpdate(operationId, 0.6, f"Crawling {len(allUrls)} URLs")
+            self.services.chat.progressLogUpdate(operationId, 0.6, f"Crawling {len(validatedUrls)} URLs")

         # Use parent operation ID directly (parentId should be operationId, not log entry ID)
         parentOperationId = operationId # Use the parent's operationId directly

         crawlResult = await self._performWebCrawl(
             instruction=instruction,
-            urls=allUrls,
+            urls=validatedUrls,
             maxDepth=maxDepth,
             parentOperationId=parentOperationId
         )
@@ -194,24 +202,24 @@ class WebService:
                 "max_depth": maxDepth,
                 "country": countryCode,
                 "language": languageCode,
-                "urls_crawled": allUrls[:20], # First 20 URLs for reference
-                "total_urls": len(allUrls),
+                "urls_crawled": validatedUrls[:20], # First 20 URLs for reference
+                "total_urls": len(validatedUrls),
                 "urls_with_content": urlsWithContent,
                 "total_content_length": totalContentLength,
                 "crawl_date": self.services.utils.timestampGetUtc() if hasattr(self.services, 'utils') else None
             },
             "sections": sections,
             "statistics": {
-                "sectionCount": len(sections),
-                "total_urls": len(allUrls),
+                "sectionCount": len(sections),
+                "total_urls": len(validatedUrls),
                 "results_count": totalResults,
                 "urls_with_content": urlsWithContent,
                 "total_content_length": totalContentLength
             },
             # Keep original structure for backward compatibility
             "instruction": instruction,
-            "urls_crawled": allUrls,
-            "total_urls": len(allUrls),
+            "urls_crawled": validatedUrls,
+            "total_urls": len(validatedUrls),
             "results": crawlResult,
             "total_results": totalResults
         }
@@ -383,6 +391,50 @@ Return ONLY valid JSON, no additional text:
             logger.error(f"Error in web search: {str(e)}")
             return []

+    def _validateUrls(self, urls: List[str]) -> List[str]:
+        """
+        Validate URLs before crawling - filters out invalid URLs.
+
+        Args:
+            urls: List of URLs to validate
+
+        Returns:
+            List of valid URLs
+        """
+        validatedUrls = []
+        for url in urls:
+            if not url or not isinstance(url, str):
+                logger.debug(f"Skipping invalid URL (not a string): {url}")
+                continue
+
+            url = url.strip()
+            if not url:
+                logger.debug(f"Skipping empty URL")
+                continue
+
+            # Basic URL validation using urlparse
+            try:
+                parsed = urlparse(url)
+                # Check if URL has at least scheme and netloc
+                if not parsed.scheme or not parsed.netloc:
+                    logger.debug(f"Skipping invalid URL (missing scheme or netloc): {url}")
+                    continue
+
+                # Only allow http/https schemes
+                if parsed.scheme not in ['http', 'https']:
+                    logger.debug(f"Skipping URL with unsupported scheme '{parsed.scheme}': {url}")
+                    continue
+
+                validatedUrls.append(url)
+                logger.debug(f"Validated URL: {url}")
+
+            except Exception as e:
+                logger.warning(f"Error validating URL '{url}': {str(e)}")
+                continue
+
+        logger.info(f"Validated {len(validatedUrls)}/{len(urls)} URLs")
+        return validatedUrls
+
     async def _performWebCrawl(
         self,
         instruction: str,
@@ -390,117 +442,165 @@ Return ONLY valid JSON, no additional text:
         maxDepth: int = 2,
         parentOperationId: Optional[str] = None
     ) -> List[Dict[str, Any]]:
-        """Perform web crawl on list of URLs - calls plugin for each URL individually."""
-        crawlResults = []
-
-        # Loop over each URL and crawl one at a time
+        """Perform web crawl on list of URLs - crawls URLs in parallel for better performance."""
+        # Create tasks for parallel crawling
+        crawlTasks = []
         for urlIndex, url in enumerate(urls):
-            # Create separate operation for each URL with parent reference
-            urlOperationId = None
-            if parentOperationId:
-                workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
-                urlOperationId = f"web_crawl_url_{workflowId}_{urlIndex}_{int(time.time())}"
-                self.services.chat.progressLogStart(
-                    urlOperationId,
-                    "Web Crawl",
-                    f"URL {urlIndex + 1}",
-                    url[:50] + "..." if len(url) > 50 else url,
-                    parentOperationId=parentOperationId
-                )
-
-            try:
-                logger.info(f"Crawling URL {urlIndex + 1}/{len(urls)}: {url}")
-
-                if urlOperationId:
-                    displayUrl = url[:50] + "..." if len(url) > 50 else url
-                    self.services.chat.progressLogUpdate(urlOperationId, 0.2, f"Crawling: {displayUrl}")
-                    self.services.chat.progressLogUpdate(urlOperationId, 0.3, "Initiating crawl")
-
-                # Build crawl prompt model for single URL
-                crawlPromptModel = AiCallPromptWebCrawl(
-                    instruction=instruction,
-                    url=url, # Single URL
-                    maxDepth=maxDepth,
-                    maxWidth=5 # Default: 5 pages per level
-                )
-                crawlPrompt = crawlPromptModel.model_dump_json(exclude_none=True, indent=2)
-
-                # Debug: persist crawl prompt (with URL identifier in content for clarity)
-                debugPrompt = f"URL: {url}\n\n{crawlPrompt}"
-                self.services.utils.writeDebugFile(debugPrompt, "webcrawl_prompt")
-
-                # Call AI with WEB_CRAWL operation
-                crawlOptions = AiCallOptions(
-                    operationType=OperationTypeEnum.WEB_CRAWL,
-                    resultFormat="json"
-                )
-
-                if urlOperationId:
-                    self.services.chat.progressLogUpdate(urlOperationId, 0.4, "Calling crawl connector")
-
-                # Use unified callAiContent method with parentOperationId for hierarchical logging
-                crawlResponse = await self.services.ai.callAiContent(
-                    prompt=crawlPrompt,
-                    options=crawlOptions,
-                    outputFormat="json",
-                    parentOperationId=urlOperationId # Pass URL operation ID as parent for sub-URL logging
-                )
-
-                if urlOperationId:
-                    self.services.chat.progressLogUpdate(urlOperationId, 0.7, "Processing crawl results")
-
-                # Extract content from AiResponse
-                crawlResult = crawlResponse.content
-
-                # Debug: persist crawl response
-                if isinstance(crawlResult, str):
-                    self.services.utils.writeDebugFile(crawlResult, "webcrawl_response")
-                else:
-                    self.services.utils.writeDebugFile(json.dumps(crawlResult, indent=2), "webcrawl_response")
-
-                # Parse crawl result
-                if isinstance(crawlResult, str):
-                    try:
-                        # Extract JSON from response (handles markdown code blocks)
-                        extractedJson = self.services.utils.jsonExtractString(crawlResult)
-                        crawlData = json.loads(extractedJson) if extractedJson else json.loads(crawlResult)
-                    except:
-                        crawlData = {"url": url, "content": crawlResult}
-                else:
-                    crawlData = crawlResult
-
-                # Process crawl results and create hierarchical progress logging for sub-URLs
-                if urlOperationId:
-                    self.services.chat.progressLogUpdate(urlOperationId, 0.8, "Processing crawl results")
-
-                # Recursively process crawl results to find nested URLs and create child operations
-                processedResults = self._processCrawlResultsWithHierarchy(crawlData, url, urlOperationId, maxDepth, 0)
-
-                # Count total URLs crawled (including sub-URLs) for progress message
-                totalUrlsCrawled = self._countUrlsInResults(processedResults)
-
-                # Ensure it's a list of results
-                if isinstance(processedResults, list):
-                    crawlResults.extend(processedResults)
-                elif isinstance(processedResults, dict):
-                    crawlResults.append(processedResults)
-                else:
-                    crawlResults.append({"url": url, "content": str(processedResults)})
-
-                if urlOperationId:
-                    if totalUrlsCrawled > 1:
-                        self.services.chat.progressLogUpdate(urlOperationId, 0.9, f"Crawled {totalUrlsCrawled} URLs (including sub-URLs)")
-                    else:
-                        self.services.chat.progressLogUpdate(urlOperationId, 0.9, "Crawl completed")
-                    self.services.chat.progressLogFinish(urlOperationId, True)
-
-            except Exception as e:
-                logger.error(f"Error crawling URL {url}: {str(e)}")
-                if urlOperationId:
-                    self.services.chat.progressLogFinish(urlOperationId, False)
-                crawlResults.append({"url": url, "error": str(e)})
-
-        return crawlResults
+            task = self._crawlSingleUrl(
+                url=url,
+                urlIndex=urlIndex,
+                totalUrls=len(urls),
+                instruction=instruction,
+                maxDepth=maxDepth,
+                parentOperationId=parentOperationId
+            )
+            crawlTasks.append(task)
+
+        # Execute all crawl tasks in parallel
+        logger.info(f"Starting parallel crawl of {len(urls)} URLs")
+        crawlResults = await asyncio.gather(*crawlTasks, return_exceptions=True)
+
+        # Process results and handle exceptions
+        processedResults = []
+        for idx, result in enumerate(crawlResults):
+            if isinstance(result, Exception):
+                logger.error(f"Error crawling URL {urls[idx]}: {str(result)}")
+                processedResults.append({"url": urls[idx], "error": str(result)})
+            else:
+                processedResults.extend(result if isinstance(result, list) else [result])
+
+        logger.info(f"Completed parallel crawl: {len(processedResults)} results")
+        return processedResults
+
+    async def _crawlSingleUrl(
+        self,
+        url: str,
+        urlIndex: int,
+        totalUrls: int,
+        instruction: str,
+        maxDepth: int,
+        parentOperationId: Optional[str] = None
+    ) -> List[Dict[str, Any]]:
+        """
+        Crawl a single URL - called in parallel for multiple URLs.
+
+        Args:
+            url: URL to crawl
+            urlIndex: Index of URL in the list
+            totalUrls: Total number of URLs being crawled
+            instruction: Research instruction
+            maxDepth: Maximum crawl depth
+            parentOperationId: Parent operation ID for progress tracking
+
+        Returns:
+            List of crawl results for this URL
+        """
+        # Create separate operation for each URL with parent reference
+        urlOperationId = None
+        if parentOperationId:
+            workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
+            urlOperationId = f"web_crawl_url_{workflowId}_{urlIndex}_{int(time.time())}"
+            self.services.chat.progressLogStart(
+                urlOperationId,
+                "Web Crawl",
+                f"URL {urlIndex + 1}/{totalUrls}",
+                url[:50] + "..." if len(url) > 50 else url,
+                parentOperationId=parentOperationId
+            )
+
+        try:
+            logger.info(f"Crawling URL {urlIndex + 1}/{totalUrls}: {url}")
+
+            if urlOperationId:
+                displayUrl = url[:50] + "..." if len(url) > 50 else url
+                self.services.chat.progressLogUpdate(urlOperationId, 0.2, f"Crawling: {displayUrl}")
+                self.services.chat.progressLogUpdate(urlOperationId, 0.3, "Initiating crawl")
+
+            # Build crawl prompt model for single URL
+            crawlPromptModel = AiCallPromptWebCrawl(
+                instruction=instruction,
+                url=url, # Single URL
+                maxDepth=maxDepth,
+                maxWidth=5 # Default: 5 pages per level
+            )
+            crawlPrompt = crawlPromptModel.model_dump_json(exclude_none=True, indent=2)
+
+            # Debug: persist crawl prompt (with URL identifier in content for clarity)
+            debugPrompt = f"URL: {url}\n\n{crawlPrompt}"
+            self.services.utils.writeDebugFile(debugPrompt, "webcrawl_prompt")
+
+            # Call AI with WEB_CRAWL operation
+            crawlOptions = AiCallOptions(
+                operationType=OperationTypeEnum.WEB_CRAWL,
+                resultFormat="json"
+            )
+
+            if urlOperationId:
+                self.services.chat.progressLogUpdate(urlOperationId, 0.4, "Calling crawl connector")
+
+            # Use unified callAiContent method with parentOperationId for hierarchical logging
+            crawlResponse = await self.services.ai.callAiContent(
+                prompt=crawlPrompt,
+                options=crawlOptions,
+                outputFormat="json",
+                parentOperationId=urlOperationId # Pass URL operation ID as parent for sub-URL logging
+            )
+
+            if urlOperationId:
+                self.services.chat.progressLogUpdate(urlOperationId, 0.7, "Processing crawl results")
+
+            # Extract content from AiResponse
+            crawlResult = crawlResponse.content
+
+            # Debug: persist crawl response
+            if isinstance(crawlResult, str):
+                self.services.utils.writeDebugFile(crawlResult, "webcrawl_response")
+            else:
+                self.services.utils.writeDebugFile(json.dumps(crawlResult, indent=2), "webcrawl_response")
+
+            # Parse crawl result
+            if isinstance(crawlResult, str):
+                try:
+                    # Extract JSON from response (handles markdown code blocks)
+                    extractedJson = self.services.utils.jsonExtractString(crawlResult)
+                    crawlData = json.loads(extractedJson) if extractedJson else json.loads(crawlResult)
+                except:
+                    crawlData = {"url": url, "content": crawlResult}
+            else:
+                crawlData = crawlResult
+
+            # Process crawl results and create hierarchical progress logging for sub-URLs
+            if urlOperationId:
+                self.services.chat.progressLogUpdate(urlOperationId, 0.8, "Processing crawl results")
+
+            # Recursively process crawl results to find nested URLs and create child operations
+            processedResults = self._processCrawlResultsWithHierarchy(crawlData, url, urlOperationId, maxDepth, 0)
+
+            # Count total URLs crawled (including sub-URLs) for progress message
+            totalUrlsCrawled = self._countUrlsInResults(processedResults)
+
+            # Ensure it's a list of results
+            if isinstance(processedResults, list):
+                results = processedResults
+            elif isinstance(processedResults, dict):
+                results = [processedResults]
+            else:
+                results = [{"url": url, "content": str(processedResults)}]
+
+            if urlOperationId:
+                if totalUrlsCrawled > 1:
+                    self.services.chat.progressLogUpdate(urlOperationId, 0.9, f"Crawled {totalUrlsCrawled} URLs (including sub-URLs)")
+                else:
+                    self.services.chat.progressLogUpdate(urlOperationId, 0.9, "Crawl completed")
+                self.services.chat.progressLogFinish(urlOperationId, True)
+
+            return results
+
+        except Exception as e:
+            logger.error(f"Error crawling URL {url}: {str(e)}")
+            if urlOperationId:
+                self.services.chat.progressLogFinish(urlOperationId, False)
+            return [{"url": url, "error": str(e)}]

     def _processCrawlResultsWithHierarchy(
         self,
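The heart of the commit is this last hunk: the sequential per-URL loop becomes a fan-out of `_crawlSingleUrl` tasks executed with `asyncio.gather(..., return_exceptions=True)`, so a failing URL shows up as an error entry in the result list instead of aborting its siblings. A minimal, self-contained sketch of that pattern; the `crawlStub` coroutine is a stand-in for illustration, not the service's `_crawlSingleUrl`:

```python
import asyncio
from typing import Any, Dict, List

async def crawlStub(url: str) -> List[Dict[str, Any]]:
    """Stand-in for a per-URL crawl; fails for one URL to show the error path."""
    await asyncio.sleep(0.01)
    if "bad" in url:
        raise RuntimeError(f"cannot reach {url}")
    return [{"url": url, "content": f"content of {url}"}]

async def crawlAll(urls: List[str]) -> List[Dict[str, Any]]:
    tasks = [crawlStub(url) for url in urls]
    # return_exceptions=True keeps one failure from cancelling the other crawls
    results = await asyncio.gather(*tasks, return_exceptions=True)
    merged: List[Dict[str, Any]] = []
    for url, result in zip(urls, results):
        if isinstance(result, Exception):
            merged.append({"url": url, "error": str(result)})
        else:
            merged.extend(result)
    return merged

# Example:
# asyncio.run(crawlAll(["https://a.example", "https://bad.example"]))
# -> [{'url': 'https://a.example', 'content': ...}, {'url': 'https://bad.example', 'error': ...}]
```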