web service parallelized

2025-12-30 10:48:16 +01:00 · 2025-12-30 10:48:16 +01:00 · cb7ed7cf51
commit cb7ed7cf51
parent 0d77263fb7
3 changed files with 285 additions and 163 deletions
--- a/modules/services/serviceAi/subAiCallLooping.py
+++ b/modules/services/serviceAi/subAiCallLooping.py
@ -122,9 +122,13 @@ class AiCallLooper:
                )
                # Write the ACTUAL prompt sent to AI
                # For section content generation: only write one prompt file (first iteration)
                # For document generation: write prompt for each iteration
                isSectionContent = "_section_" in debugPrefix
                if iteration == 1 or not isSectionContent:
                    if iteration == 1:
                        self.services.utils.writeDebugFile(iterationPrompt, f"{debugPrefix}_prompt")
-                else:
+                    elif not isSectionContent:
                        self.services.utils.writeDebugFile(iterationPrompt, f"{debugPrefix}_prompt_iteration_{iteration}")
                response = await self.aiService.callAi(request)
@ -146,9 +150,12 @@ class AiCallLooper:
                    self.services.chat.progressLogUpdate(iterationOperationId, 0.6, f"AI response received ({bytesDisplay})")
                # Write raw AI response to debug file
                # For section content generation: only write one response file (first iteration)
                # For document generation: write response for each iteration
                if iteration == 1 or not isSectionContent:
                    if iteration == 1:
                        self.services.utils.writeDebugFile(result, f"{debugPrefix}_response")
-                else:
+                    elif not isSectionContent:
                        self.services.utils.writeDebugFile(result, f"{debugPrefix}_response_iteration_{iteration}")
                # Emit stats for this iteration (only if workflow exists and has id)
@ -219,9 +226,9 @@ class AiCallLooper:
                    logger.info(f"Iteration {iteration}: Section content generation detected (elements found), returning JSON directly")
                    if iterationOperationId:
                        self.services.chat.progressLogFinish(iterationOperationId, True)
-                    # Write final result
+                    # Note: Debug files (_prompt and _response) are already written above for iteration 1
                    # No need to write _final_result as it's redundant with _response
                    final_json = json.dumps(parsedJsonForSection, indent=2, ensure_ascii=False) if parsedJsonForSection else (extractedJsonForSection or result)
                    self.services.utils.writeDebugFile(final_json, f"{debugPrefix}_final_result")
                    return final_json
                # Extract sections from response (handles both valid and broken JSON)
@ -397,6 +404,9 @@ class AiCallLooper:
                    self.services.chat.progressLogUpdate(operationId, estimatedProgress, f"Pipeline: {bytesDisplay} (iteration {iteration})")
                # Log merged sections for debugging
                # For section content generation: skip merged sections debug files (only one prompt/response needed)
                isSectionContent = "_section_" in debugPrefix
                if not isSectionContent:
                    self.services.utils.writeDebugFile(merged_json_str, f"{debugPrefix}_merged_sections_iteration_{iteration}")
                # Check if we should continue (completion detection)
@ -465,6 +475,9 @@ class AiCallLooper:
        final_result = self.responseParser.buildFinalResultFromSections(allSections, documentMetadata)
        # Write final result to debug file
        # For section content generation: skip final_result debug file (response already written)
        isSectionContent = "_section_" in debugPrefix
        if not isSectionContent:
            self.services.utils.writeDebugFile(final_result, f"{debugPrefix}_final_result")
        return final_result
--- a/modules/services/serviceAi/subStructureFilling.py
+++ b/modules/services/serviceAi/subStructureFilling.py
@ -537,11 +537,6 @@ class StructureFiller:
                        try:
                            self.services.chat.progressLogUpdate(sectionOperationId, 0.2, "Building generation prompt")
                            self.services.utils.writeDebugFile(
                                generationPrompt,
                                f"{chapterId}_section_{sectionId}_prompt"
                            )
                            logger.debug(f"Logged section prompt: {chapterId}_section_{sectionId}_prompt (aggregation)")
                            self.services.chat.progressLogUpdate(sectionOperationId, 0.4, "Calling AI for content generation")
@ -553,6 +548,12 @@ class StructureFiller:
                                    logger.warning(f"Truncating DALL-E prompt from {len(generationPrompt)} to {maxPromptLength} characters")
                                    generationPrompt = generationPrompt[:maxPromptLength].rsplit('\n', 1)[0]
                                # Write debug file for IMAGE_GENERATE (direct callAi, no _callAiWithLooping)
                                self.services.utils.writeDebugFile(
                                    generationPrompt,
                                    f"{chapterId}_section_{sectionId}_prompt"
                                )
                                request = AiCallRequest(
                                    prompt=generationPrompt,
                                    contentParts=[],
@ -564,6 +565,12 @@ class StructureFiller:
                                )
                                aiResponse = await self.aiService.callAi(request)
                                generatedElements = []
                                # Write debug file for IMAGE_GENERATE response (direct callAi, no _callAiWithLooping)
                                self.services.utils.writeDebugFile(
                                    aiResponse.content if hasattr(aiResponse, 'content') else str(aiResponse),
                                    f"{chapterId}_section_{sectionId}_response"
                                )
                            else:
                                async def buildSectionPromptWithContinuation(
                                    section: Dict[str, Any],
@ -665,11 +672,7 @@ The JSON should be a fragment that can be merged with the previous response."""
                                    generatedElements = []
                            self.services.chat.progressLogUpdate(sectionOperationId, 0.6, "Processing AI response")
-                            self.services.utils.writeDebugFile(
+                            # Note: Debug files are written by _callAiWithLooping using debugPrefix
                                aiResponse.content if hasattr(aiResponse, 'content') else str(aiResponse),
                                f"{chapterId}_section_{sectionId}_response"
                            )
                            logger.debug(f"Logged section response: {chapterId}_section_{sectionId}_response (aggregation)")
                            self.services.chat.progressLogUpdate(sectionOperationId, 0.8, "Validating generated content")
@ -735,11 +738,6 @@ The JSON should be a fragment that can be merged with the previous response."""
                    try:
                        self.services.chat.progressLogUpdate(sectionOperationId, 0.2, "Building generation prompt")
                        self.services.utils.writeDebugFile(
                            generationPrompt,
                            f"{chapterId}_section_{sectionId}_prompt"
                        )
                        logger.debug(f"Logged section prompt: {chapterId}_section_{sectionId}_prompt")
                        self.services.chat.progressLogUpdate(sectionOperationId, 0.4, "Calling AI for content generation")
@ -751,6 +749,12 @@ The JSON should be a fragment that can be merged with the previous response."""
                                logger.warning(f"Truncating DALL-E prompt from {len(generationPrompt)} to {maxPromptLength} characters")
                                generationPrompt = generationPrompt[:maxPromptLength].rsplit('\n', 1)[0]
                            # Write debug file for IMAGE_GENERATE (direct callAi, no _callAiWithLooping)
                            self.services.utils.writeDebugFile(
                                generationPrompt,
                                f"{chapterId}_section_{sectionId}_prompt"
                            )
                            request = AiCallRequest(
                                prompt=generationPrompt,
                                contentParts=[],
@ -762,6 +766,12 @@ The JSON should be a fragment that can be merged with the previous response."""
                            )
                            aiResponse = await self.aiService.callAi(request)
                            generatedElements = []
                            # Write debug file for IMAGE_GENERATE response (direct callAi, no _callAiWithLooping)
                            self.services.utils.writeDebugFile(
                                aiResponse.content if hasattr(aiResponse, 'content') else str(aiResponse),
                                f"{chapterId}_section_{sectionId}_response"
                            )
                        else:
                            isAggregation = False
@ -865,11 +875,7 @@ The JSON should be a fragment that can be merged with the previous response."""
                                generatedElements = []
                        self.services.chat.progressLogUpdate(sectionOperationId, 0.6, "Processing AI response")
-                        self.services.utils.writeDebugFile(
+                        # Note: Debug files are written by _callAiWithLooping using debugPrefix
                            aiResponse.content if hasattr(aiResponse, 'content') else str(aiResponse),
                            f"{chapterId}_section_{sectionId}_response"
                        )
                        logger.debug(f"Logged section response: {chapterId}_section_{sectionId}_response")
                        self.services.chat.progressLogUpdate(sectionOperationId, 0.8, "Validating generated content")
@ -968,11 +974,6 @@ The JSON should be a fragment that can be merged with the previous response."""
                            try:
                                self.services.chat.progressLogUpdate(sectionOperationId, 0.2, "Building generation prompt")
                                self.services.utils.writeDebugFile(
                                    generationPrompt,
                                    f"{chapterId}_section_{sectionId}_prompt"
                                )
                                logger.debug(f"Logged section prompt: {chapterId}_section_{sectionId}_prompt")
                                self.services.chat.progressLogUpdate(sectionOperationId, 0.4, "Calling AI for content generation")
@ -984,6 +985,12 @@ The JSON should be a fragment that can be merged with the previous response."""
                                        logger.warning(f"Truncating DALL-E prompt from {len(generationPrompt)} to {maxPromptLength} characters")
                                        generationPrompt = generationPrompt[:maxPromptLength].rsplit('\n', 1)[0]
                                    # Write debug file for IMAGE_GENERATE (direct callAi, no _callAiWithLooping)
                                    self.services.utils.writeDebugFile(
                                        generationPrompt,
                                        f"{chapterId}_section_{sectionId}_prompt"
                                    )
                                    request = AiCallRequest(
                                        prompt=generationPrompt,
                                        contentParts=[],
@ -995,6 +1002,12 @@ The JSON should be a fragment that can be merged with the previous response."""
                                    )
                                    aiResponse = await self.aiService.callAi(request)
                                    generatedElements = []
                                    # Write debug file for IMAGE_GENERATE response (direct callAi, no _callAiWithLooping)
                                    self.services.utils.writeDebugFile(
                                        aiResponse.content if hasattr(aiResponse, 'content') else str(aiResponse),
                                        f"{chapterId}_section_{sectionId}_response"
                                    )
                                else:
                                    isAggregation = False
@ -1098,11 +1111,7 @@ The JSON should be a fragment that can be merged with the previous response."""
                                        generatedElements = []
                                self.services.chat.progressLogUpdate(sectionOperationId, 0.6, "Processing AI response")
-                                self.services.utils.writeDebugFile(
+                                # Note: Debug files are written by _callAiWithLooping using debugPrefix
                                    aiResponse.content if hasattr(aiResponse, 'content') else str(aiResponse),
                                    f"{chapterId}_section_{sectionId}_response"
                                )
                                logger.debug(f"Logged section response: {chapterId}_section_{sectionId}_response")
                                self.services.chat.progressLogUpdate(sectionOperationId, 0.8, "Validating generated content")
--- a/modules/services/serviceWeb/mainServiceWeb.py
+++ b/modules/services/serviceWeb/mainServiceWeb.py
@ -8,6 +8,8 @@ Manages the two-step process: WEB_SEARCH then WEB_CRAWL.
 import json
 import logging
 import time
 import asyncio
 from urllib.parse import urlparse
 from typing import Dict, Any, List, Optional
 from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, AiCallPromptWebSearch, AiCallPromptWebCrawl
@ -99,12 +101,18 @@ class WebService:
                self.services.chat.progressLogUpdate(operationId, 0.5, f"Found {len(allUrls)} total URLs")
-            # Step 3: Filter to maxNumberPages (simple cut, no intelligent filtering)
+            # Step 3: Validate and filter URLs before crawling
-            if len(allUrls) > maxNumberPages:
+            validatedUrls = self._validateUrls(allUrls)
-                allUrls = allUrls[:maxNumberPages]
+            if not validatedUrls:
                logger.warning(f"All {len(allUrls)} URLs failed validation")
                return {"error": "No valid URLs found to crawl"}
            # Filter to maxNumberPages (simple cut, no intelligent filtering)
            if len(validatedUrls) > maxNumberPages:
                validatedUrls = validatedUrls[:maxNumberPages]
                logger.info(f"Limited URLs to {maxNumberPages}")
-            if not allUrls:
+            if not validatedUrls:
                return {"error": "No URLs found to crawl"}
            # Step 4: Translate researchDepth to maxDepth
@ -114,14 +122,14 @@ class WebService:
            # Step 5: Crawl all URLs with hierarchical logging
            if operationId:
                self.services.chat.progressLogUpdate(operationId, 0.4, "Initiating")
-                self.services.chat.progressLogUpdate(operationId, 0.6, f"Crawling {len(allUrls)} URLs")
+                self.services.chat.progressLogUpdate(operationId, 0.6, f"Crawling {len(validatedUrls)} URLs")
            # Use parent operation ID directly (parentId should be operationId, not log entry ID)
            parentOperationId = operationId  # Use the parent's operationId directly
            crawlResult = await self._performWebCrawl(
                instruction=instruction,
-                urls=allUrls,
+                urls=validatedUrls,
                maxDepth=maxDepth,
                parentOperationId=parentOperationId
            )
@ -194,8 +202,8 @@ class WebService:
                    "max_depth": maxDepth,
                    "country": countryCode,
                    "language": languageCode,
-                    "urls_crawled": allUrls[:20],  # First 20 URLs for reference
+                    "urls_crawled": validatedUrls[:20],  # First 20 URLs for reference
-                    "total_urls": len(allUrls),
+                    "total_urls": len(validatedUrls),
                    "urls_with_content": urlsWithContent,
                    "total_content_length": totalContentLength,
                    "crawl_date": self.services.utils.timestampGetUtc() if hasattr(self.services, 'utils') else None
@ -203,15 +211,15 @@ class WebService:
                "sections": sections,
                "statistics": {
                "sectionCount": len(sections),
-                    "total_urls": len(allUrls),
+                "total_urls": len(validatedUrls),
                    "results_count": totalResults,
                    "urls_with_content": urlsWithContent,
                    "total_content_length": totalContentLength
                },
                # Keep original structure for backward compatibility
                "instruction": instruction,
-                "urls_crawled": allUrls,
+                "urls_crawled": validatedUrls,
-                "total_urls": len(allUrls),
+                "total_urls": len(validatedUrls),
                "results": crawlResult,
                "total_results": totalResults
            }
@ -383,6 +391,50 @@ Return ONLY valid JSON, no additional text:
            logger.error(f"Error in web search: {str(e)}")
            return []
    def _validateUrls(self, urls: List[str]) -> List[str]:
        """
        Validate URLs before crawling - filters out invalid URLs.

        A URL is kept only when it is a non-empty string that, after
        surrounding whitespace is stripped, parses with an http/https scheme
        and a non-empty host. Input order is preserved and duplicates are
        not removed.

        Args:
            urls: List of URLs to validate

        Returns:
            List of valid URLs (whitespace-stripped), in input order
        """
        allowedSchemes = ('http', 'https')
        validatedUrls = []
        for url in urls:
            if not url or not isinstance(url, str):
                logger.debug(f"Skipping invalid URL (not a string): {url}")
                continue
            url = url.strip()
            if not url:
                logger.debug("Skipping empty URL")
                continue
            # Basic URL validation using urlparse. For str input urlparse only
            # raises ValueError (e.g. unbalanced IPv6 brackets, netloc characters
            # rejected by the NFKC check), so that is the only error handled here.
            try:
                parsed = urlparse(url)
                host = parsed.hostname
            except ValueError as e:
                logger.warning(f"Error validating URL '{url}': {str(e)}")
                continue
            # Require a real host, not just a non-empty netloc: "http://:80/" or
            # "http://user@/" have a netloc but nothing that can be crawled.
            if not parsed.scheme or not host:
                logger.debug(f"Skipping invalid URL (missing scheme or host): {url}")
                continue
            # Only allow http/https schemes (urlparse already lower-cases the scheme)
            if parsed.scheme not in allowedSchemes:
                logger.debug(f"Skipping URL with unsupported scheme '{parsed.scheme}': {url}")
                continue
            validatedUrls.append(url)
            logger.debug(f"Validated URL: {url}")
        logger.info(f"Validated {len(validatedUrls)}/{len(urls)} URLs")
        return validatedUrls
    async def _performWebCrawl(
        self,
        instruction: str,
@ -390,11 +442,59 @@ Return ONLY valid JSON, no additional text:
        maxDepth: int = 2,
        parentOperationId: Optional[str] = None
        ) -> List[Dict[str, Any]]:
-        """Perform web crawl on list of URLs - calls plugin for each URL individually."""
+        """Perform web crawl on list of URLs - crawls URLs in parallel for better performance."""
-        crawlResults = []
+        # Create tasks for parallel crawling
-        
+        crawlTasks = []
        # Build one crawl coroutine per URL; they are awaited together below
        for urlIndex, url in enumerate(urls):
            task = self._crawlSingleUrl(
                url=url,
                urlIndex=urlIndex,
                totalUrls=len(urls),
                instruction=instruction,
                maxDepth=maxDepth,
                parentOperationId=parentOperationId
            )
            crawlTasks.append(task)
        # Execute all crawl tasks in parallel
        logger.info(f"Starting parallel crawl of {len(urls)} URLs")
        crawlResults = await asyncio.gather(*crawlTasks, return_exceptions=True)
        # Process results and handle exceptions
        processedResults = []
        for idx, result in enumerate(crawlResults):
            if isinstance(result, Exception):
                logger.error(f"Error crawling URL {urls[idx]}: {str(result)}")
                processedResults.append({"url": urls[idx], "error": str(result)})
            else:
                processedResults.extend(result if isinstance(result, list) else [result])
        logger.info(f"Completed parallel crawl: {len(processedResults)} results")
        return processedResults
    async def _crawlSingleUrl(
        self,
        url: str,
        urlIndex: int,
        totalUrls: int,
        instruction: str,
        maxDepth: int,
        parentOperationId: Optional[str] = None
        ) -> List[Dict[str, Any]]:
        """
        Crawl a single URL - called in parallel for multiple URLs.
        Args:
            url: URL to crawl
            urlIndex: Index of URL in the list
            totalUrls: Total number of URLs being crawled
            instruction: Research instruction
            maxDepth: Maximum crawl depth
            parentOperationId: Parent operation ID for progress tracking
        Returns:
            List of crawl results for this URL
        """
        # Create separate operation for each URL with parent reference
        urlOperationId = None
        if parentOperationId:
@ -403,13 +503,13 @@ Return ONLY valid JSON, no additional text:
            self.services.chat.progressLogStart(
                urlOperationId,
                "Web Crawl",
-                    f"URL {urlIndex + 1}",
+                f"URL {urlIndex + 1}/{totalUrls}",
                url[:50] + "..." if len(url) > 50 else url,
                parentOperationId=parentOperationId
            )
        try:
-                logger.info(f"Crawling URL {urlIndex + 1}/{len(urls)}: {url}")
+            logger.info(f"Crawling URL {urlIndex + 1}/{totalUrls}: {url}")
            if urlOperationId:
                displayUrl = url[:50] + "..." if len(url) > 50 else url
@ -481,11 +581,11 @@ Return ONLY valid JSON, no additional text:
            # Ensure it's a list of results
            if isinstance(processedResults, list):
-                    crawlResults.extend(processedResults)
+                results = processedResults
            elif isinstance(processedResults, dict):
-                    crawlResults.append(processedResults)
+                results = [processedResults]
            else:
-                    crawlResults.append({"url": url, "content": str(processedResults)})
+                results = [{"url": url, "content": str(processedResults)}]
            if urlOperationId:
                if totalUrlsCrawled > 1:
@ -494,13 +594,13 @@ Return ONLY valid JSON, no additional text:
                    self.services.chat.progressLogUpdate(urlOperationId, 0.9, "Crawl completed")
                self.services.chat.progressLogFinish(urlOperationId, True)
            return results
        except Exception as e:
            logger.error(f"Error crawling URL {url}: {str(e)}")
            if urlOperationId:
                self.services.chat.progressLogFinish(urlOperationId, False)
-                crawlResults.append({"url": url, "error": str(e)})
+            return [{"url": url, "error": str(e)}]
        return crawlResults
    def _processCrawlResultsWithHierarchy(
        self,