web service parallelized

This commit is contained in:
ValueOn AG 2025-12-30 10:48:16 +01:00
parent 0d77263fb7
commit cb7ed7cf51
3 changed files with 285 additions and 163 deletions

View file

@ -122,9 +122,13 @@ class AiCallLooper:
) )
# Write the ACTUAL prompt sent to AI # Write the ACTUAL prompt sent to AI
# For section content generation: only write one prompt file (first iteration)
# For document generation: write prompt for each iteration
isSectionContent = "_section_" in debugPrefix
if iteration == 1 or not isSectionContent:
if iteration == 1: if iteration == 1:
self.services.utils.writeDebugFile(iterationPrompt, f"{debugPrefix}_prompt") self.services.utils.writeDebugFile(iterationPrompt, f"{debugPrefix}_prompt")
else: elif not isSectionContent:
self.services.utils.writeDebugFile(iterationPrompt, f"{debugPrefix}_prompt_iteration_{iteration}") self.services.utils.writeDebugFile(iterationPrompt, f"{debugPrefix}_prompt_iteration_{iteration}")
response = await self.aiService.callAi(request) response = await self.aiService.callAi(request)
@ -146,9 +150,12 @@ class AiCallLooper:
self.services.chat.progressLogUpdate(iterationOperationId, 0.6, f"AI response received ({bytesDisplay})") self.services.chat.progressLogUpdate(iterationOperationId, 0.6, f"AI response received ({bytesDisplay})")
# Write raw AI response to debug file # Write raw AI response to debug file
# For section content generation: only write one response file (first iteration)
# For document generation: write response for each iteration
if iteration == 1 or not isSectionContent:
if iteration == 1: if iteration == 1:
self.services.utils.writeDebugFile(result, f"{debugPrefix}_response") self.services.utils.writeDebugFile(result, f"{debugPrefix}_response")
else: elif not isSectionContent:
self.services.utils.writeDebugFile(result, f"{debugPrefix}_response_iteration_{iteration}") self.services.utils.writeDebugFile(result, f"{debugPrefix}_response_iteration_{iteration}")
# Emit stats for this iteration (only if workflow exists and has id) # Emit stats for this iteration (only if workflow exists and has id)
@ -219,9 +226,9 @@ class AiCallLooper:
logger.info(f"Iteration {iteration}: Section content generation detected (elements found), returning JSON directly") logger.info(f"Iteration {iteration}: Section content generation detected (elements found), returning JSON directly")
if iterationOperationId: if iterationOperationId:
self.services.chat.progressLogFinish(iterationOperationId, True) self.services.chat.progressLogFinish(iterationOperationId, True)
# Write final result # Note: Debug files (_prompt and _response) are already written above for iteration 1
# No need to write _final_result as it's redundant with _response
final_json = json.dumps(parsedJsonForSection, indent=2, ensure_ascii=False) if parsedJsonForSection else (extractedJsonForSection or result) final_json = json.dumps(parsedJsonForSection, indent=2, ensure_ascii=False) if parsedJsonForSection else (extractedJsonForSection or result)
self.services.utils.writeDebugFile(final_json, f"{debugPrefix}_final_result")
return final_json return final_json
# Extract sections from response (handles both valid and broken JSON) # Extract sections from response (handles both valid and broken JSON)
@ -397,6 +404,9 @@ class AiCallLooper:
self.services.chat.progressLogUpdate(operationId, estimatedProgress, f"Pipeline: {bytesDisplay} (iteration {iteration})") self.services.chat.progressLogUpdate(operationId, estimatedProgress, f"Pipeline: {bytesDisplay} (iteration {iteration})")
# Log merged sections for debugging # Log merged sections for debugging
# For section content generation: skip merged sections debug files (only one prompt/response needed)
isSectionContent = "_section_" in debugPrefix
if not isSectionContent:
self.services.utils.writeDebugFile(merged_json_str, f"{debugPrefix}_merged_sections_iteration_{iteration}") self.services.utils.writeDebugFile(merged_json_str, f"{debugPrefix}_merged_sections_iteration_{iteration}")
# Check if we should continue (completion detection) # Check if we should continue (completion detection)
@ -465,6 +475,9 @@ class AiCallLooper:
final_result = self.responseParser.buildFinalResultFromSections(allSections, documentMetadata) final_result = self.responseParser.buildFinalResultFromSections(allSections, documentMetadata)
# Write final result to debug file # Write final result to debug file
# For section content generation: skip final_result debug file (response already written)
isSectionContent = "_section_" in debugPrefix
if not isSectionContent:
self.services.utils.writeDebugFile(final_result, f"{debugPrefix}_final_result") self.services.utils.writeDebugFile(final_result, f"{debugPrefix}_final_result")
return final_result return final_result

View file

@ -537,11 +537,6 @@ class StructureFiller:
try: try:
self.services.chat.progressLogUpdate(sectionOperationId, 0.2, "Building generation prompt") self.services.chat.progressLogUpdate(sectionOperationId, 0.2, "Building generation prompt")
self.services.utils.writeDebugFile(
generationPrompt,
f"{chapterId}_section_{sectionId}_prompt"
)
logger.debug(f"Logged section prompt: {chapterId}_section_{sectionId}_prompt (aggregation)")
self.services.chat.progressLogUpdate(sectionOperationId, 0.4, "Calling AI for content generation") self.services.chat.progressLogUpdate(sectionOperationId, 0.4, "Calling AI for content generation")
@ -553,6 +548,12 @@ class StructureFiller:
logger.warning(f"Truncating DALL-E prompt from {len(generationPrompt)} to {maxPromptLength} characters") logger.warning(f"Truncating DALL-E prompt from {len(generationPrompt)} to {maxPromptLength} characters")
generationPrompt = generationPrompt[:maxPromptLength].rsplit('\n', 1)[0] generationPrompt = generationPrompt[:maxPromptLength].rsplit('\n', 1)[0]
# Write debug file for IMAGE_GENERATE (direct callAi, no _callAiWithLooping)
self.services.utils.writeDebugFile(
generationPrompt,
f"{chapterId}_section_{sectionId}_prompt"
)
request = AiCallRequest( request = AiCallRequest(
prompt=generationPrompt, prompt=generationPrompt,
contentParts=[], contentParts=[],
@ -564,6 +565,12 @@ class StructureFiller:
) )
aiResponse = await self.aiService.callAi(request) aiResponse = await self.aiService.callAi(request)
generatedElements = [] generatedElements = []
# Write debug file for IMAGE_GENERATE response (direct callAi, no _callAiWithLooping)
self.services.utils.writeDebugFile(
aiResponse.content if hasattr(aiResponse, 'content') else str(aiResponse),
f"{chapterId}_section_{sectionId}_response"
)
else: else:
async def buildSectionPromptWithContinuation( async def buildSectionPromptWithContinuation(
section: Dict[str, Any], section: Dict[str, Any],
@ -665,11 +672,7 @@ The JSON should be a fragment that can be merged with the previous response."""
generatedElements = [] generatedElements = []
self.services.chat.progressLogUpdate(sectionOperationId, 0.6, "Processing AI response") self.services.chat.progressLogUpdate(sectionOperationId, 0.6, "Processing AI response")
self.services.utils.writeDebugFile( # Note: Debug files are written by _callAiWithLooping using debugPrefix
aiResponse.content if hasattr(aiResponse, 'content') else str(aiResponse),
f"{chapterId}_section_{sectionId}_response"
)
logger.debug(f"Logged section response: {chapterId}_section_{sectionId}_response (aggregation)")
self.services.chat.progressLogUpdate(sectionOperationId, 0.8, "Validating generated content") self.services.chat.progressLogUpdate(sectionOperationId, 0.8, "Validating generated content")
@ -735,11 +738,6 @@ The JSON should be a fragment that can be merged with the previous response."""
try: try:
self.services.chat.progressLogUpdate(sectionOperationId, 0.2, "Building generation prompt") self.services.chat.progressLogUpdate(sectionOperationId, 0.2, "Building generation prompt")
self.services.utils.writeDebugFile(
generationPrompt,
f"{chapterId}_section_{sectionId}_prompt"
)
logger.debug(f"Logged section prompt: {chapterId}_section_{sectionId}_prompt")
self.services.chat.progressLogUpdate(sectionOperationId, 0.4, "Calling AI for content generation") self.services.chat.progressLogUpdate(sectionOperationId, 0.4, "Calling AI for content generation")
@ -751,6 +749,12 @@ The JSON should be a fragment that can be merged with the previous response."""
logger.warning(f"Truncating DALL-E prompt from {len(generationPrompt)} to {maxPromptLength} characters") logger.warning(f"Truncating DALL-E prompt from {len(generationPrompt)} to {maxPromptLength} characters")
generationPrompt = generationPrompt[:maxPromptLength].rsplit('\n', 1)[0] generationPrompt = generationPrompt[:maxPromptLength].rsplit('\n', 1)[0]
# Write debug file for IMAGE_GENERATE (direct callAi, no _callAiWithLooping)
self.services.utils.writeDebugFile(
generationPrompt,
f"{chapterId}_section_{sectionId}_prompt"
)
request = AiCallRequest( request = AiCallRequest(
prompt=generationPrompt, prompt=generationPrompt,
contentParts=[], contentParts=[],
@ -762,6 +766,12 @@ The JSON should be a fragment that can be merged with the previous response."""
) )
aiResponse = await self.aiService.callAi(request) aiResponse = await self.aiService.callAi(request)
generatedElements = [] generatedElements = []
# Write debug file for IMAGE_GENERATE response (direct callAi, no _callAiWithLooping)
self.services.utils.writeDebugFile(
aiResponse.content if hasattr(aiResponse, 'content') else str(aiResponse),
f"{chapterId}_section_{sectionId}_response"
)
else: else:
isAggregation = False isAggregation = False
@ -865,11 +875,7 @@ The JSON should be a fragment that can be merged with the previous response."""
generatedElements = [] generatedElements = []
self.services.chat.progressLogUpdate(sectionOperationId, 0.6, "Processing AI response") self.services.chat.progressLogUpdate(sectionOperationId, 0.6, "Processing AI response")
self.services.utils.writeDebugFile( # Note: Debug files are written by _callAiWithLooping using debugPrefix
aiResponse.content if hasattr(aiResponse, 'content') else str(aiResponse),
f"{chapterId}_section_{sectionId}_response"
)
logger.debug(f"Logged section response: {chapterId}_section_{sectionId}_response")
self.services.chat.progressLogUpdate(sectionOperationId, 0.8, "Validating generated content") self.services.chat.progressLogUpdate(sectionOperationId, 0.8, "Validating generated content")
@ -968,11 +974,6 @@ The JSON should be a fragment that can be merged with the previous response."""
try: try:
self.services.chat.progressLogUpdate(sectionOperationId, 0.2, "Building generation prompt") self.services.chat.progressLogUpdate(sectionOperationId, 0.2, "Building generation prompt")
self.services.utils.writeDebugFile(
generationPrompt,
f"{chapterId}_section_{sectionId}_prompt"
)
logger.debug(f"Logged section prompt: {chapterId}_section_{sectionId}_prompt")
self.services.chat.progressLogUpdate(sectionOperationId, 0.4, "Calling AI for content generation") self.services.chat.progressLogUpdate(sectionOperationId, 0.4, "Calling AI for content generation")
@ -984,6 +985,12 @@ The JSON should be a fragment that can be merged with the previous response."""
logger.warning(f"Truncating DALL-E prompt from {len(generationPrompt)} to {maxPromptLength} characters") logger.warning(f"Truncating DALL-E prompt from {len(generationPrompt)} to {maxPromptLength} characters")
generationPrompt = generationPrompt[:maxPromptLength].rsplit('\n', 1)[0] generationPrompt = generationPrompt[:maxPromptLength].rsplit('\n', 1)[0]
# Write debug file for IMAGE_GENERATE (direct callAi, no _callAiWithLooping)
self.services.utils.writeDebugFile(
generationPrompt,
f"{chapterId}_section_{sectionId}_prompt"
)
request = AiCallRequest( request = AiCallRequest(
prompt=generationPrompt, prompt=generationPrompt,
contentParts=[], contentParts=[],
@ -995,6 +1002,12 @@ The JSON should be a fragment that can be merged with the previous response."""
) )
aiResponse = await self.aiService.callAi(request) aiResponse = await self.aiService.callAi(request)
generatedElements = [] generatedElements = []
# Write debug file for IMAGE_GENERATE response (direct callAi, no _callAiWithLooping)
self.services.utils.writeDebugFile(
aiResponse.content if hasattr(aiResponse, 'content') else str(aiResponse),
f"{chapterId}_section_{sectionId}_response"
)
else: else:
isAggregation = False isAggregation = False
@ -1098,11 +1111,7 @@ The JSON should be a fragment that can be merged with the previous response."""
generatedElements = [] generatedElements = []
self.services.chat.progressLogUpdate(sectionOperationId, 0.6, "Processing AI response") self.services.chat.progressLogUpdate(sectionOperationId, 0.6, "Processing AI response")
self.services.utils.writeDebugFile( # Note: Debug files are written by _callAiWithLooping using debugPrefix
aiResponse.content if hasattr(aiResponse, 'content') else str(aiResponse),
f"{chapterId}_section_{sectionId}_response"
)
logger.debug(f"Logged section response: {chapterId}_section_{sectionId}_response")
self.services.chat.progressLogUpdate(sectionOperationId, 0.8, "Validating generated content") self.services.chat.progressLogUpdate(sectionOperationId, 0.8, "Validating generated content")

View file

@ -8,6 +8,8 @@ Manages the two-step process: WEB_SEARCH then WEB_CRAWL.
import json import json
import logging import logging
import time import time
import asyncio
from urllib.parse import urlparse
from typing import Dict, Any, List, Optional from typing import Dict, Any, List, Optional
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, AiCallPromptWebSearch, AiCallPromptWebCrawl from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, AiCallPromptWebSearch, AiCallPromptWebCrawl
@ -99,12 +101,18 @@ class WebService:
self.services.chat.progressLogUpdate(operationId, 0.5, f"Found {len(allUrls)} total URLs") self.services.chat.progressLogUpdate(operationId, 0.5, f"Found {len(allUrls)} total URLs")
# Step 3: Filter to maxNumberPages (simple cut, no intelligent filtering) # Step 3: Validate and filter URLs before crawling
if len(allUrls) > maxNumberPages: validatedUrls = self._validateUrls(allUrls)
allUrls = allUrls[:maxNumberPages] if not validatedUrls:
logger.warning(f"All {len(allUrls)} URLs failed validation")
return {"error": "No valid URLs found to crawl"}
# Filter to maxNumberPages (simple cut, no intelligent filtering)
if len(validatedUrls) > maxNumberPages:
validatedUrls = validatedUrls[:maxNumberPages]
logger.info(f"Limited URLs to {maxNumberPages}") logger.info(f"Limited URLs to {maxNumberPages}")
if not allUrls: if not validatedUrls:
return {"error": "No URLs found to crawl"} return {"error": "No URLs found to crawl"}
# Step 4: Translate researchDepth to maxDepth # Step 4: Translate researchDepth to maxDepth
@ -114,14 +122,14 @@ class WebService:
# Step 5: Crawl all URLs with hierarchical logging # Step 5: Crawl all URLs with hierarchical logging
if operationId: if operationId:
self.services.chat.progressLogUpdate(operationId, 0.4, "Initiating") self.services.chat.progressLogUpdate(operationId, 0.4, "Initiating")
self.services.chat.progressLogUpdate(operationId, 0.6, f"Crawling {len(allUrls)} URLs") self.services.chat.progressLogUpdate(operationId, 0.6, f"Crawling {len(validatedUrls)} URLs")
# Use parent operation ID directly (parentId should be operationId, not log entry ID) # Use parent operation ID directly (parentId should be operationId, not log entry ID)
parentOperationId = operationId # Use the parent's operationId directly parentOperationId = operationId # Use the parent's operationId directly
crawlResult = await self._performWebCrawl( crawlResult = await self._performWebCrawl(
instruction=instruction, instruction=instruction,
urls=allUrls, urls=validatedUrls,
maxDepth=maxDepth, maxDepth=maxDepth,
parentOperationId=parentOperationId parentOperationId=parentOperationId
) )
@ -194,8 +202,8 @@ class WebService:
"max_depth": maxDepth, "max_depth": maxDepth,
"country": countryCode, "country": countryCode,
"language": languageCode, "language": languageCode,
"urls_crawled": allUrls[:20], # First 20 URLs for reference "urls_crawled": validatedUrls[:20], # First 20 URLs for reference
"total_urls": len(allUrls), "total_urls": len(validatedUrls),
"urls_with_content": urlsWithContent, "urls_with_content": urlsWithContent,
"total_content_length": totalContentLength, "total_content_length": totalContentLength,
"crawl_date": self.services.utils.timestampGetUtc() if hasattr(self.services, 'utils') else None "crawl_date": self.services.utils.timestampGetUtc() if hasattr(self.services, 'utils') else None
@ -203,15 +211,15 @@ class WebService:
"sections": sections, "sections": sections,
"statistics": { "statistics": {
"sectionCount": len(sections), "sectionCount": len(sections),
"total_urls": len(allUrls), "total_urls": len(validatedUrls),
"results_count": totalResults, "results_count": totalResults,
"urls_with_content": urlsWithContent, "urls_with_content": urlsWithContent,
"total_content_length": totalContentLength "total_content_length": totalContentLength
}, },
# Keep original structure for backward compatibility # Keep original structure for backward compatibility
"instruction": instruction, "instruction": instruction,
"urls_crawled": allUrls, "urls_crawled": validatedUrls,
"total_urls": len(allUrls), "total_urls": len(validatedUrls),
"results": crawlResult, "results": crawlResult,
"total_results": totalResults "total_results": totalResults
} }
@ -383,6 +391,50 @@ Return ONLY valid JSON, no additional text:
logger.error(f"Error in web search: {str(e)}") logger.error(f"Error in web search: {str(e)}")
return [] return []
def _validateUrls(self, urls: List[str]) -> List[str]:
    """
    Validate URLs before crawling - filters out invalid URLs.

    A URL is kept only when it is a non-empty string that parses with an
    http/https scheme and a non-empty host. Surrounding whitespace is
    stripped from the returned URLs and the input order is preserved.

    Args:
        urls: List of URLs to validate (None is treated as an empty list)

    Returns:
        List of valid URLs
    """
    # Tolerate a missing list so callers get an empty result instead of a TypeError.
    urls = urls or []
    validatedUrls = []

    for url in urls:
        if not url or not isinstance(url, str):
            logger.debug(f"Skipping invalid URL (not a string): {url}")
            continue

        url = url.strip()
        if not url:
            logger.debug("Skipping empty URL")
            continue

        # Basic URL validation using urlparse. Only the parsing step can raise
        # (ValueError, e.g. for an unterminated IPv6 bracket), so keep the try
        # body limited to it.
        try:
            parsed = urlparse(url)
            # Use hostname rather than netloc: netloc is non-empty for
            # authorities such as ":8080" or "user@" that carry no host at all.
            hostname = parsed.hostname
        except ValueError as e:
            logger.warning(f"Error validating URL '{url}': {str(e)}")
            continue

        # Check if URL has at least scheme and a real host
        if not parsed.scheme or not hostname:
            logger.debug(f"Skipping invalid URL (missing scheme or netloc): {url}")
            continue

        # Only allow http/https schemes (urlparse lower-cases the scheme)
        if parsed.scheme not in ('http', 'https'):
            logger.debug(f"Skipping URL with unsupported scheme '{parsed.scheme}': {url}")
            continue

        validatedUrls.append(url)
        logger.debug(f"Validated URL: {url}")

    logger.info(f"Validated {len(validatedUrls)}/{len(urls)} URLs")
    return validatedUrls
async def _performWebCrawl( async def _performWebCrawl(
self, self,
instruction: str, instruction: str,
@ -390,11 +442,59 @@ Return ONLY valid JSON, no additional text:
maxDepth: int = 2, maxDepth: int = 2,
parentOperationId: Optional[str] = None parentOperationId: Optional[str] = None
) -> List[Dict[str, Any]]: ) -> List[Dict[str, Any]]:
"""Perform web crawl on list of URLs - calls plugin for each URL individually.""" """Perform web crawl on list of URLs - crawls URLs in parallel for better performance."""
crawlResults = [] # Create tasks for parallel crawling
crawlTasks = []
# Loop over each URL and crawl one at a time
for urlIndex, url in enumerate(urls): for urlIndex, url in enumerate(urls):
task = self._crawlSingleUrl(
url=url,
urlIndex=urlIndex,
totalUrls=len(urls),
instruction=instruction,
maxDepth=maxDepth,
parentOperationId=parentOperationId
)
crawlTasks.append(task)
# Execute all crawl tasks in parallel
logger.info(f"Starting parallel crawl of {len(urls)} URLs")
crawlResults = await asyncio.gather(*crawlTasks, return_exceptions=True)
# Process results and handle exceptions
processedResults = []
for idx, result in enumerate(crawlResults):
if isinstance(result, Exception):
logger.error(f"Error crawling URL {urls[idx]}: {str(result)}")
processedResults.append({"url": urls[idx], "error": str(result)})
else:
processedResults.extend(result if isinstance(result, list) else [result])
logger.info(f"Completed parallel crawl: {len(processedResults)} results")
return processedResults
async def _crawlSingleUrl(
self,
url: str,
urlIndex: int,
totalUrls: int,
instruction: str,
maxDepth: int,
parentOperationId: Optional[str] = None
) -> List[Dict[str, Any]]:
"""
Crawl a single URL - called in parallel for multiple URLs.
Args:
url: URL to crawl
urlIndex: Index of URL in the list
totalUrls: Total number of URLs being crawled
instruction: Research instruction
maxDepth: Maximum crawl depth
parentOperationId: Parent operation ID for progress tracking
Returns:
List of crawl results for this URL
"""
# Create separate operation for each URL with parent reference # Create separate operation for each URL with parent reference
urlOperationId = None urlOperationId = None
if parentOperationId: if parentOperationId:
@ -403,13 +503,13 @@ Return ONLY valid JSON, no additional text:
self.services.chat.progressLogStart( self.services.chat.progressLogStart(
urlOperationId, urlOperationId,
"Web Crawl", "Web Crawl",
f"URL {urlIndex + 1}", f"URL {urlIndex + 1}/{totalUrls}",
url[:50] + "..." if len(url) > 50 else url, url[:50] + "..." if len(url) > 50 else url,
parentOperationId=parentOperationId parentOperationId=parentOperationId
) )
try: try:
logger.info(f"Crawling URL {urlIndex + 1}/{len(urls)}: {url}") logger.info(f"Crawling URL {urlIndex + 1}/{totalUrls}: {url}")
if urlOperationId: if urlOperationId:
displayUrl = url[:50] + "..." if len(url) > 50 else url displayUrl = url[:50] + "..." if len(url) > 50 else url
@ -481,11 +581,11 @@ Return ONLY valid JSON, no additional text:
# Ensure it's a list of results # Ensure it's a list of results
if isinstance(processedResults, list): if isinstance(processedResults, list):
crawlResults.extend(processedResults) results = processedResults
elif isinstance(processedResults, dict): elif isinstance(processedResults, dict):
crawlResults.append(processedResults) results = [processedResults]
else: else:
crawlResults.append({"url": url, "content": str(processedResults)}) results = [{"url": url, "content": str(processedResults)}]
if urlOperationId: if urlOperationId:
if totalUrlsCrawled > 1: if totalUrlsCrawled > 1:
@ -494,13 +594,13 @@ Return ONLY valid JSON, no additional text:
self.services.chat.progressLogUpdate(urlOperationId, 0.9, "Crawl completed") self.services.chat.progressLogUpdate(urlOperationId, 0.9, "Crawl completed")
self.services.chat.progressLogFinish(urlOperationId, True) self.services.chat.progressLogFinish(urlOperationId, True)
return results
except Exception as e: except Exception as e:
logger.error(f"Error crawling URL {url}: {str(e)}") logger.error(f"Error crawling URL {url}: {str(e)}")
if urlOperationId: if urlOperationId:
self.services.chat.progressLogFinish(urlOperationId, False) self.services.chat.progressLogFinish(urlOperationId, False)
crawlResults.append({"url": url, "error": str(e)}) return [{"url": url, "error": str(e)}]
return crawlResults
def _processCrawlResultsWithHierarchy( def _processCrawlResultsWithHierarchy(
self, self,