Merge pull request #83 from valueonag/feat/foundation-customer-use-cases

Feat/foundation customer use cases
Patrick Motsch 2026-01-11 13:08:41 +01:00 committed by GitHub
commit b2a3d4afc8
3 changed files with 49 additions and 12 deletions

@@ -345,8 +345,8 @@ class AiTavily(BaseConnectorAi):
retryDelay = self.crawlRetryDelay
timeout = self.crawlTimeout
logger.debug(f"Starting crawl of URL: {url}")
logger.debug(f"Crawl settings: instructions={instructions}, limit={limit}, maxDepth={maxDepth}, maxBreadth={maxBreadth}, timeout={timeout}s")
logger.info(f"Starting crawl of URL: {url}")
logger.info(f"Crawl settings: instructions={instructions[:100] if instructions else None}..., limit={limit}, maxDepth={maxDepth}, maxBreadth={maxBreadth}, timeout={timeout}s")
for attempt in range(maxRetries + 1):
try:
@@ -371,7 +371,7 @@ class AiTavily(BaseConnectorAi):
if maxBreadth:
kwargsCrawl["max_breadth"] = maxBreadth
logger.debug(f"Sending request to Tavily with kwargs: {kwargsCrawl}")
logger.info(f"Sending request to Tavily API with parameters: {kwargsCrawl}")
response = await asyncio.wait_for(
self.client.crawl(**kwargsCrawl),

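The hunk above raises the crawl lifecycle messages from debug to info and truncates the free-form instructions before logging them. A minimal sketch of that pattern, assuming a standard library logger and hypothetical parameter values (the helper name is illustrative, not part of the connector):

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ai_tavily")

def logCrawlStart(url, instructions, limit, maxDepth, maxBreadth, timeout):
    # Lifecycle messages now go out at INFO so they are visible at the default level.
    logger.info(f"Starting crawl of URL: {url}")
    # Long instructions are cut to 100 characters; None stays readable as "None".
    logger.info(
        f"Crawl settings: instructions={instructions[:100] if instructions else None}..., "
        f"limit={limit}, maxDepth={maxDepth}, maxBreadth={maxBreadth}, timeout={timeout}s"
    )

# Hypothetical values, for illustration only.
logCrawlStart("https://example.com", "Summarize the pricing pages. " * 10, 25, 2, 10, 60)
```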
@@ -115,9 +115,14 @@ class WebService:
if not validatedUrls:
return {"error": "No URLs found to crawl"}
# Step 4: Translate researchDepth to maxDepth
# Step 4: Translate researchDepth to maxDepth and maxWidth
depthMap = {"fast": 1, "general": 2, "deep": 3}
maxDepth = depthMap.get(finalResearchDepth.lower(), 2)
# Scale maxWidth based on research depth: fast=5, general=10, deep=20 pages per level
widthMap = {"fast": 5, "general": 10, "deep": 20}
maxWidth = widthMap.get(finalResearchDepth.lower(), 10)
logger.info(f"Research depth settings: depth={finalResearchDepth}, maxDepth={maxDepth}, maxWidth={maxWidth}")
# Step 5: Crawl all URLs with hierarchical logging
if operationId:
@@ -131,6 +136,7 @@ class WebService:
instruction=instruction,
urls=validatedUrls,
maxDepth=maxDepth,
maxWidth=maxWidth, # Pass maxWidth to crawl function
parentOperationId=parentOperationId
)
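Step 4 above turns the requested researchDepth into both a crawl depth and a per-level page budget. A self-contained sketch of that mapping, with `resolveCrawlLimits` as a hypothetical helper name:

```python
# Profiles introduced by this change: crawl depth and pages per level (maxWidth).
DEPTH_MAP = {"fast": 1, "general": 2, "deep": 3}
WIDTH_MAP = {"fast": 5, "general": 10, "deep": 20}

def resolveCrawlLimits(researchDepth: str) -> tuple[int, int]:
    # Unknown values fall back to the "general" profile (2, 10),
    # mirroring the .get() defaults used in the diff.
    key = researchDepth.lower()
    return DEPTH_MAP.get(key, 2), WIDTH_MAP.get(key, 10)

print(resolveCrawlLimits("deep"))     # -> (3, 20)
print(resolveCrawlLimits("unknown"))  # -> (2, 10)
```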
@@ -440,6 +446,7 @@ Return ONLY valid JSON, no additional text:
instruction: str,
urls: List[str],
maxDepth: int = 2,
maxWidth: int = 10,
parentOperationId: Optional[str] = None
) -> List[Dict[str, Any]]:
"""Perform web crawl on list of URLs - crawls URLs in parallel for better performance."""
@@ -452,6 +459,7 @@ Return ONLY valid JSON, no additional text:
totalUrls=len(urls),
instruction=instruction,
maxDepth=maxDepth,
maxWidth=maxWidth, # Pass maxWidth to single URL crawl
parentOperationId=parentOperationId
)
crawlTasks.append(task)
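The crawl helper builds one coroutine per URL and collects them in crawlTasks; the hunks do not show how the tasks are awaited, but the docstring states the URLs are crawled in parallel. A sketch of that fan-out pattern, assuming asyncio.gather and a stubbed per-URL crawl:

```python
import asyncio
from typing import Any, Dict, List

async def crawlOneUrl(url: str, urlIndex: int, totalUrls: int,
                      instruction: str, maxDepth: int, maxWidth: int = 10) -> List[Dict[str, Any]]:
    # Stub standing in for the real per-URL crawl; it only echoes its settings.
    await asyncio.sleep(0)  # yield control, as a real network call would
    return [{"url": url, "maxDepth": maxDepth, "maxWidth": maxWidth}]

async def crawlAll(instruction: str, urls: List[str], maxDepth: int, maxWidth: int):
    crawlTasks = [
        crawlOneUrl(url=u, urlIndex=i, totalUrls=len(urls),
                    instruction=instruction, maxDepth=maxDepth, maxWidth=maxWidth)
        for i, u in enumerate(urls)
    ]
    # All URLs are crawled concurrently; gather preserves input order.
    perUrlResults = await asyncio.gather(*crawlTasks)
    return [item for result in perUrlResults for item in result]

print(asyncio.run(crawlAll("Collect docs", ["https://a.example", "https://b.example"], 2, 10)))
```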
@@ -479,6 +487,7 @@ Return ONLY valid JSON, no additional text:
totalUrls: int,
instruction: str,
maxDepth: int,
maxWidth: int = 10,
parentOperationId: Optional[str] = None
) -> List[Dict[str, Any]]:
"""
@@ -517,11 +526,13 @@ Return ONLY valid JSON, no additional text:
self.services.chat.progressLogUpdate(urlOperationId, 0.3, "Initiating crawl")
# Build crawl prompt model for single URL
# maxWidth is passed from performWebResearch based on researchDepth
logger.info(f"Crawling URL {urlIndex + 1}/{totalUrls} with maxDepth={maxDepth}, maxWidth={maxWidth}")
crawlPromptModel = AiCallPromptWebCrawl(
instruction=instruction,
url=url, # Single URL
maxDepth=maxDepth,
maxWidth=5 # Default: 5 pages per level
maxWidth=maxWidth # Scaled based on researchDepth: fast=5, general=10, deep=20
)
crawlPrompt = crawlPromptModel.model_dump_json(exclude_none=True, indent=2)

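Finally, the scaled maxWidth replaces the hard-coded 5 in the per-URL crawl prompt. The model_dump_json call suggests AiCallPromptWebCrawl is a Pydantic model; a sketch with a stand-in model carrying only the fields visible at this call site (the real definition lives elsewhere in the repository and may differ):

```python
from pydantic import BaseModel

class AiCallPromptWebCrawl(BaseModel):
    # Stand-in with only the fields used above; the real model may carry more.
    instruction: str
    url: str
    maxDepth: int = 2
    maxWidth: int = 10

crawlPromptModel = AiCallPromptWebCrawl(
    instruction="Collect the product documentation",
    url="https://example.com/docs",
    maxDepth=3,
    maxWidth=20,  # deep research profile: 20 pages per level
)
print(crawlPromptModel.model_dump_json(exclude_none=True, indent=2))
```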
@@ -779,36 +779,62 @@ class ContentValidator:
actionHistoryContext = f"\n\n=== ACTION HISTORY ===\n" + "\n".join(f"- {entry}" for entry in historyEntries)
actionHistoryContext += "\n\nIMPORTANT: This shows the complete workflow that produced the documents. For process-oriented criteria (e.g., 'internet search performed'), check ACTION HISTORY first. Document metadata may only reflect the LAST action, not the entire workflow."
# Build document index context (all documents delivered in current round)
# Build document index context (all documents delivered in current round AND past rounds)
# CRITICAL: Include past rounds so validator can see files produced in previous rounds
# This prevents endless loops when files are produced in multiple sets across rounds
documentIndexContext = ""
if context and self.services and hasattr(self.services, 'chat') and hasattr(self.services, 'workflow') and self.services.workflow:
try:
documentIndex = self.services.chat.getAvailableDocuments(self.services.workflow)
if documentIndex and documentIndex.strip() and documentIndex != "No documents available":
# Extract only "Current round documents" section if present
# Extract BOTH "Current round documents" AND "Past rounds documents" sections
lines = documentIndex.split('\n')
currentRoundSection = []
pastRoundsSection = []
inCurrentRound = False
inPastRounds = False
for line in lines:
if "Current round documents:" in line:
inCurrentRound = True
inPastRounds = False
currentRoundSection.append(line)
elif "Past rounds documents:" in line:
inCurrentRound = False
inPastRounds = True
pastRoundsSection.append(line)
elif "AVAILABLE_CONNECTIONS_INDEX:" in line:
# End of document sections
break
elif inCurrentRound:
if line.strip().startswith("- docList:") or line.strip().startswith(" - docItem:") or line.strip().startswith("- docItem:"):
currentRoundSection.append(line)
elif line.strip() == "":
# Empty line is okay, continue
continue
elif "Past rounds documents:" in line or "AVAILABLE_CONNECTIONS_INDEX:" in line:
# End of current round section
break
else:
# Still in current round section
currentRoundSection.append(line)
elif inPastRounds:
if line.strip().startswith("- docList:") or line.strip().startswith(" - docItem:") or line.strip().startswith("- docItem:"):
pastRoundsSection.append(line)
elif line.strip() == "":
# Empty line is okay, continue
continue
else:
# Still in past rounds section
pastRoundsSection.append(line)
# Build context with both sections
sections = []
if currentRoundSection:
documentIndexContext = "\n\n=== ALL DOCUMENTS DELIVERED IN CURRENT ROUND ===\n" + "\n".join(currentRoundSection)
documentIndexContext += "\n\nIMPORTANT: This shows ALL documents that have been delivered in the current round, not just the ones being validated in this step. Use this to check if all required formats/documents are present across the entire round."
sections.append("=== ALL DOCUMENTS DELIVERED IN CURRENT ROUND ===\n" + "\n".join(currentRoundSection))
if pastRoundsSection:
sections.append("=== ALL DOCUMENTS DELIVERED IN PAST ROUNDS ===\n" + "\n".join(pastRoundsSection))
if sections:
documentIndexContext = "\n\n" + "\n\n".join(sections)
documentIndexContext += "\n\nIMPORTANT: This shows ALL documents that have been delivered in the current round AND past rounds, not just the ones being validated in this step. Use this to check if all required formats/documents are present across ALL rounds. Files produced in previous rounds (e.g., html+css in round 1, python in round 2) should all be visible here."
except Exception as e:
logger.warning(f"Error extracting document index for validation: {str(e)}")
# Continue without document index - not critical
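The extraction logic added to ContentValidator is a small state machine over the rendered document index: it now keeps both the current-round and past-rounds sections and stops at the connections index. A self-contained sketch of the same walk, run against a hypothetical index string:

```python
def splitDocumentIndex(documentIndex: str) -> tuple[list[str], list[str]]:
    """Collect the current-round and past-rounds document lines from the index text."""
    currentRoundSection: list[str] = []
    pastRoundsSection: list[str] = []
    inCurrentRound = inPastRounds = False
    for line in documentIndex.split("\n"):
        if "Current round documents:" in line:
            inCurrentRound, inPastRounds = True, False
            currentRoundSection.append(line)
        elif "Past rounds documents:" in line:
            inCurrentRound, inPastRounds = False, True
            pastRoundsSection.append(line)
        elif "AVAILABLE_CONNECTIONS_INDEX:" in line:
            break  # everything after this marker is unrelated to documents
        elif inCurrentRound and line.strip():
            currentRoundSection.append(line)
        elif inPastRounds and line.strip():
            pastRoundsSection.append(line)
    return currentRoundSection, pastRoundsSection

# Hypothetical index text, shaped like the sections the validator expects.
index = (
    "Current round documents:\n- docList: report\n - docItem: summary.md\n\n"
    "Past rounds documents:\n- docList: website\n - docItem: index.html\n\n"
    "AVAILABLE_CONNECTIONS_INDEX:\n- web"
)
current, past = splitDocumentIndex(index)
print("\n".join(current))
print("\n".join(past))
```

Keeping past rounds visible is what breaks the endless-loop case described in the comments: a criterion satisfied by a round-1 artifact (for example html+css delivered earlier) no longer looks unmet when round 2 only produces python files.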