diff --git a/docs/WEBSEARCH_FIXES.md b/docs/WEBSEARCH_FIXES.md
new file mode 100644
index 00000000..420a9845
--- /dev/null
+++ b/docs/WEBSEARCH_FIXES.md
@@ -0,0 +1,134 @@
+# Web Search Content Extraction Fixes
+
+## Problem Summary
+
+The Tavily web search integration was failing to extract content from search results, causing web research to return empty or incomplete data. The main issues were unhandled `None` values and incomplete error recovery.
+
+## Main Issues Fixed
+
+### 1. Incomplete Content Extraction from Search Results
+
+**Problem:**
+- When the Tavily API returned search results, some results had `raw_content` set to `None` (not missing, but explicitly `None`)
+- The code read `rawContent=result.get("raw_content")` with no fallback, so an explicit `None` was passed on unchanged
+- This caused `None` values to propagate through the system instead of falling back to the `content` field or an empty string
+
+**Fix:**
+Changed the content extraction in `aicorePluginTavily.py` to properly handle `None` values:
+```python
+# Before (line 344):
+rawContent=result.get("raw_content")
+
+# After:
+rawContent=result.get("raw_content") or result.get("content") or ""
+```
+
+This ensures that if `raw_content` is `None`, it falls back to `content`, and if that is also `None`, it defaults to an empty string.
+
+**Additional Fix:**
+Added defensive checks in the `webSearch` method to safely extract content even when result objects have unexpected structures:
+```python
+# Safely extract content with multiple fallbacks
+content = ""
+if hasattr(result, 'rawContent'):
+    content = result.rawContent or ""
+if not content and hasattr(result, 'content'):
+    content = result.content or ""
+```
+
+### 2. NoneType Error When Logging Content Length
+
+**Problem:**
+- The code called `len(first_result.get('raw_content', ''))` for logging
+- When the `raw_content` key existed but its value was `None`, `.get()` returned `None` instead of the default `''`
+- This caused `len(None)` to fail with `TypeError: object of type 'NoneType' has no len()`
+
+**Fix:**
+Changed the logging code to safely handle `None` values:
+```python
+# Before (line 338):
+logger.debug(f"First result has raw_content: {'raw_content' in first_result}, content length: {len(first_result.get('raw_content', ''))}")
+
+# After:
+raw_content = first_result.get('raw_content') or ''
+logger.debug(f"First result has raw_content: {'raw_content' in first_result}, content length: {len(raw_content)}")
+```
+
+### 3. Missing Error Recovery in Content Extraction
+
+**Problem:**
+- When processing search results, a single result that failed to extract could abort the entire extraction
+- There was no recovery mechanism to extract at least the URLs when content extraction failed
+- Errors were logged but processing stopped, losing potentially useful data
+
+**Fix:**
+Added per-result error handling with recovery:
+```python
+for result in searchResults:
+    try:
+        # Extract URL, content, title safely
+        # ... extraction logic ...
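+        # Illustrative sketch of the guarded extraction (assumed attribute
+        # names, not the exact production code): each field is read
+        # defensively so a single malformed result raises here and is
+        # caught below instead of aborting the whole loop, e.g.
+        #   url = getattr(result, "url", "") or ""
+        #   content = getattr(result, "rawContent", None) or ""
+        #   title = getattr(result, "title", "") or ""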
+    except Exception as resultError:
+        logger.warning(f"Error processing individual search result: {resultError}")
+        # Continue processing other results instead of failing completely
+        continue
+```
+
+Also added recovery at the extraction level:
+```python
+except Exception as extractionError:
+    logger.error(f"Error extracting URLs and content from search results: {extractionError}")
+    # Try to recover at least URLs
+    try:
+        urls = [result.url for result in searchResults if hasattr(result, 'url') and result.url]
+        logger.info(f"Recovered {len(urls)} URLs after extraction error")
+    except Exception:
+        logger.error("Failed to recover any URLs from search results")
+```
+
+### 4. Incomplete Crawl Result Processing
+
+**Problem:**
+- When a crawl returned results but processing of an individual page failed, the entire crawl was lost
+- There was no fallback to extract at least the URLs from failed crawl results
+- Missing content fields could cause errors when formatting results
+
+**Fix:**
+Added error handling for individual page processing:
+```python
+for i, result in enumerate(crawlResults, 1):
+    try:
+        # Format page content
+        # ... formatting logic ...
+    except Exception as pageError:
+        logger.warning(f"Error formatting page {i} from crawl: {pageError}")
+        # Try to add at least the URL
+        try:
+            pageUrls.append(result.url if hasattr(result, 'url') and result.url else webCrawlPrompt.url)
+        except Exception:
+            pass
+```
+
+Also ensured all result fields have safe defaults:
+```python
+results.append(WebCrawlResult(
+    url=result_url or url,  # Fallback to base URL
+    content=result_content,  # Already ensured to be string
+    title=result_title  # Already ensured to be string
+))
+```
+
+## Impact
+
+These fixes ensure that:
+1. **Content is always extracted** - Even when `raw_content` is `None`, the system falls back to the `content` field or an empty string
+2. **Partial results are preserved** - If some results fail, the others are still processed and returned
+3. **URLs are recovered** - Even when content extraction fails completely, URLs can still be extracted for crawling
+4. 
**No crashes from None values** - All `None` values are properly handled before operations like `len()` are called + +## Testing Recommendations + +- Test with Tavily search results that have `raw_content` set to `None` +- Test with mixed results (some with content, some without) +- Test error recovery when individual results fail +- Verify that URLs are still extracted even when content extraction fails diff --git a/modules/aicore/aicorePluginTavily.py b/modules/aicore/aicorePluginTavily.py index 90718683..ffab7adc 100644 --- a/modules/aicore/aicorePluginTavily.py +++ b/modules/aicore/aicorePluginTavily.py @@ -321,13 +321,28 @@ class AiTavily(BaseConnectorAi): # Return all results without score filtering # Tavily's scoring is already applied by the API - logger.info(f"Tavily returned {len(response.get('results', []))} results") + results_count = len(response.get('results', [])) + logger.info(f"Tavily returned {results_count} results") + + # Log content availability + results_with_content = 0 + for result in response.get('results', []): + if result.get("raw_content"): + results_with_content += 1 + logger.info(f"Tavily results with raw_content: {results_with_content}/{results_count}") + + # Log first result structure for debugging + if response.get('results') and len(response['results']) > 0: + first_result = response['results'][0] + logger.debug(f"First result keys: {list(first_result.keys())}") + raw_content = first_result.get('raw_content') or '' + logger.debug(f"First result has raw_content: {'raw_content' in first_result}, content length: {len(raw_content)}") return [ WebSearchResult( - title=result["title"], - url=self._cleanUrl(result["url"]), - rawContent=result.get("raw_content") + title=result.get("title", ""), + url=self._cleanUrl(result.get("url", "")), + rawContent=result.get("raw_content") or result.get("content") or "" ) for result in response["results"] ] @@ -381,24 +396,90 @@ class AiTavily(BaseConnectorAi): logger.debug(f"Tavily response received: {type(response)}") # Parse response - could be dict with results or list - if isinstance(response, dict) and "results" in response: - pageResults = response["results"] + if isinstance(response, dict): + if "results" in response: + pageResults = response["results"] + logger.debug(f"Found 'results' key in response dict with {len(pageResults)} items") + else: + logger.warning(f"Response dict keys: {list(response.keys())}") + # Check for other possible keys + if "pages" in response: + pageResults = response["pages"] + logger.debug(f"Found 'pages' key with {len(pageResults)} items") + elif "content" in response: + # Single page result + pageResults = [response] + logger.debug("Found 'content' key, treating as single page result") + else: + logger.warning(f"Unexpected response dict structure: {list(response.keys())}") + pageResults = [] elif isinstance(response, list): pageResults = response + logger.debug(f"Response is a list with {len(pageResults)} items") else: - logger.warning(f"Unexpected response format: {type(response)}") + logger.warning(f"Unexpected response format: {type(response)}, value: {str(response)[:200]}") pageResults = [] - logger.debug(f"Got {len(pageResults)} pages from crawl") + logger.info(f"Got {len(pageResults)} pages from crawl for URL: {url}") + if len(pageResults) == 0: + logger.warning(f"Tavily crawl returned 0 pages for URL: {url}. 
Response structure: {type(response)}") + if isinstance(response, dict): + logger.warning(f"Response keys: {list(response.keys())}") + # Log all values to debug (not just first 3) + for key, value in response.items(): + value_str = str(value) + if len(value_str) > 200: + value_str = value_str[:200] + "..." + logger.warning(f" {key}: {type(value)} - {value_str}") + + # Check for error messages in response + if "error" in response: + logger.error(f"Tavily API error in response: {response.get('error')}") + if "message" in response: + logger.warning(f"Tavily API message: {response.get('message')}") + elif isinstance(response, str): + logger.warning(f"Tavily returned string response (first 500 chars): {response[:500]}") + else: + logger.warning(f"Unexpected response type: {type(response)}, value: {str(response)[:500]}") - # Convert to WebCrawlResult format + # Convert to WebCrawlResult format with error handling results = [] - for result in pageResults: - results.append(WebCrawlResult( - url=result.get("url", url), - content=result.get("raw_content", result.get("content", "")), - title=result.get("title", "") - )) + for idx, result in enumerate(pageResults): + try: + # Safely extract fields + result_url = result.get("url") if isinstance(result, dict) else (getattr(result, "url", None) if hasattr(result, "url") else url) + result_content = "" + if isinstance(result, dict): + result_content = result.get("raw_content") or result.get("content") or "" + elif hasattr(result, "raw_content"): + result_content = result.raw_content or "" + elif hasattr(result, "content"): + result_content = result.content or "" + + result_title = "" + if isinstance(result, dict): + result_title = result.get("title", "") + elif hasattr(result, "title"): + result_title = result.title or "" + + results.append(WebCrawlResult( + url=result_url or url, + content=result_content, + title=result_title + )) + except Exception as resultError: + logger.warning(f"Error processing crawl result {idx}: {resultError}") + # Try to create a minimal result with at least the URL + try: + if isinstance(result, dict) and result.get("url"): + results.append(WebCrawlResult( + url=result.get("url", url), + content="", + title="" + )) + except Exception: + logger.error(f"Failed to create minimal result for crawl result {idx}") + continue logger.debug(f"Crawl successful: extracted {len(results)} pages from URL") return results @@ -413,7 +494,7 @@ class AiTavily(BaseConnectorAi): except Exception as e: logger.warning(f"Crawl attempt {attempt + 1} failed for URL {url}: {str(e)}") - logger.debug(f"Full error details: {type(e).__name__}: {str(e)}") + logger.debug(f"Full error details: {type(e).__name__}: {str(e)}", exc_info=True) # Check if it's a validation error and log more details if "validation" in str(e).lower(): @@ -427,10 +508,22 @@ class AiTavily(BaseConnectorAi): if len(url) > 2000: logger.debug(f" WARNING: URL is very long ({len(url)} chars)") + # Log API-specific errors + error_str = str(e).lower() + if "rate limit" in error_str or "429" in error_str: + logger.error(f"Tavily API rate limit hit for URL: {url}") + elif "401" in error_str or "unauthorized" in error_str: + logger.error(f"Tavily API authentication failed for URL: {url}") + elif "404" in error_str or "not found" in error_str: + logger.warning(f"URL not found (404) for: {url}") + elif "timeout" in error_str: + logger.warning(f"Timeout error for URL: {url}") + if attempt < maxRetries: logger.info(f"Retrying in {retryDelay} seconds...") await asyncio.sleep(retryDelay) else: + 
logger.error(f"Crawl failed after {maxRetries + 1} attempts for URL: {url}") raise Exception(f"Crawl failed after {maxRetries + 1} attempts: {str(e)}") async def _routeWebOperation(self, modelCall: AiModelCall) -> "AiModelResponse": @@ -508,21 +601,84 @@ class AiTavily(BaseConnectorAi): includeRawContent="text" ) - # Extract URLs from results - urls = [result.url for result in searchResults] + # Extract URLs and content from results with error handling + urls = [] + results_with_content = [] + content_count = 0 - # Return as JSON array + try: + for result in searchResults: + try: + # Safely extract URL + url = result.url if hasattr(result, 'url') and result.url else "" + if url: + urls.append(url) + + # Safely extract content + content = "" + if hasattr(result, 'rawContent'): + content = result.rawContent or "" + if not content and hasattr(result, 'content'): + content = result.content or "" + + if content: + content_count += 1 + + # Safely extract title + title = result.title if hasattr(result, 'title') and result.title else "" + + results_with_content.append({ + "url": url, + "title": title, + "content": content, + "score": getattr(result, 'score', 0) + }) + except Exception as resultError: + logger.warning(f"Error processing individual search result: {resultError}") + # Continue processing other results + continue + + logger.info(f"Tavily search: {len(urls)} URLs, {content_count} with content, {len(results_with_content)} total results") + if content_count == 0: + logger.warning("Tavily search returned no content - results may need crawling") + except Exception as extractionError: + logger.error(f"Error extracting URLs and content from search results: {extractionError}") + # Try to recover at least URLs + try: + urls = [result.url for result in searchResults if hasattr(result, 'url') and result.url] + logger.info(f"Recovered {len(urls)} URLs after extraction error") + except Exception: + logger.error("Failed to recover any URLs from search results") + + # Return both URLs and full results in JSON for direct extraction + # Format: {"urls": [...], "results": [...]} import json + response_data = { + "urls": urls, + "results": results_with_content + } + return AiModelResponse( - content=json.dumps(urls, indent=2), + content=json.dumps(response_data, indent=2), success=True, - metadata={"total_urls": len(urls), "operation": "WEB_SEARCH_DATA"} + metadata={ + "total_urls": len(urls), + "operation": "WEB_SEARCH_DATA", + "results_with_content": results_with_content # Also in metadata for compatibility + } ) except Exception as e: - logger.error(f"Error in Tavily web search: {str(e)}") + logger.error(f"Error in Tavily web search: {str(e)}", exc_info=True) + import json + # Return error response with empty results + error_response = { + "urls": [], + "results": [], + "error": str(e) + } return AiModelResponse( - content="[]", + content=json.dumps(error_response, indent=2), success=False, error=str(e) ) @@ -575,23 +731,44 @@ class AiTavily(BaseConnectorAi): # If we got multiple pages from the crawl, we need to format them differently # Return the first result for backwards compatibility, but include total page count if crawlResults and len(crawlResults) > 0: - # Get all pages content + # Get all pages content with error handling allContent = "" + pageUrls = [] for i, result in enumerate(crawlResults, 1): - pageHeader = f"\n{'='*60}\nPAGE {i}: {result.url}\n{'='*60}\n" - if result.title: - allContent += f"{pageHeader}Title: {result.title}\n\n" - allContent += f"{result.content}\n" + try: + pageHeader 
= f"\n{'='*60}\nPAGE {i}: {result.url}\n{'='*60}\n" + if result.title: + allContent += f"{pageHeader}Title: {result.title}\n\n" + else: + allContent += f"{pageHeader}\n" + allContent += f"{result.content or ''}\n" + pageUrls.append(result.url) + except Exception as pageError: + logger.warning(f"Error formatting page {i} from crawl: {pageError}") + # Try to add at least the URL + try: + pageUrls.append(result.url if hasattr(result, 'url') and result.url else webCrawlPrompt.url) + except Exception: + pass resultData = { "url": webCrawlPrompt.url, - "title": crawlResults[0].title if crawlResults[0].title else "Content", + "title": crawlResults[0].title if crawlResults and crawlResults[0].title else "Content", "content": allContent, "pagesCrawled": len(crawlResults), - "pageUrls": [result.url for result in crawlResults] + "pageUrls": pageUrls } + logger.info(f"Crawl successful: {len(crawlResults)} pages extracted from {webCrawlPrompt.url}") else: - resultData = {"url": webCrawlPrompt.url, "title": "", "content": "", "error": "No content extracted", "pagesCrawled": 0} + logger.warning(f"Crawl returned no results for URL: {webCrawlPrompt.url}") + resultData = { + "url": webCrawlPrompt.url, + "title": "", + "content": "", + "error": "No content extracted - Tavily crawl returned 0 pages", + "pagesCrawled": 0, + "pageUrls": [] + } # Return as JSON - same format as Perplexity but with multiple pages content import json @@ -602,9 +779,17 @@ class AiTavily(BaseConnectorAi): ) except Exception as e: - logger.error(f"Error in Tavily web crawl: {str(e)}") + logger.error(f"Error in Tavily web crawl: {str(e)}", exc_info=True) import json - errorResult = {"error": str(e), "url": webCrawlPrompt.url if 'webCrawlPrompt' in locals() else ""} + crawl_url = webCrawlPrompt.url if 'webCrawlPrompt' in locals() else "" + errorResult = { + "url": crawl_url, + "title": "", + "content": "", + "error": str(e), + "pagesCrawled": 0, + "pageUrls": [] + } return AiModelResponse( content=json.dumps(errorResult, indent=2), success=False, diff --git a/modules/features/chatbot/chatbotConstants.py b/modules/features/chatbot/chatbotConstants.py index 633528fa..7b3b6cec 100644 --- a/modules/features/chatbot/chatbotConstants.py +++ b/modules/features/chatbot/chatbotConstants.py @@ -158,12 +158,64 @@ WICHTIG: Wenn du Informationen aus der Datenbank präsentierst, kennzeichne dies - Beginne deine Antwort mit einer klaren Kennzeichnung, z.B.: "Aus der Datenbank habe ich folgende Artikel gefunden:" - Bei kombinierten Informationen (Datenbank + Internet): Trenne klar zwischen beiden Quellen -QUELLENANGABE - INTERNET: -WICHTIG: Wenn du Informationen aus dem Internet präsentierst, kennzeichne dies IMMER klar für den Nutzer. 
-- Beginne Internet-Recherchen mit: "Aus meiner Internet-Recherche:" oder "Laut Online-Quellen:" -- Gib IMMER die konkreten Quellen an (Website-Namen und Links) -- Bei mehreren Quellen: Liste die Quellen auf und verweise darauf -- Trenne klar zwischen Datenbank-Informationen und Internet-Recherchen +⚠️⚠️⚠️ QUELLENANGABE - INTERNET - ABSOLUT VERBINDLICH ⚠️⚠️⚠️ +Wenn du Informationen aus einer Web-Recherche präsentierst, MUSS du dies IMMER explizit kennzeichnen und die Quellen angeben: +- ❌ VERBOTEN: Informationen aus Web-Recherchen ohne explizite Kennzeichnung zu präsentieren +- ❌ VERBOTEN: Informationen aus Web-Recherchen ohne Quellenangabe zu präsentieren +- ❌ VERBOTEN: Quellen nur am Ende als Liste zu präsentieren +- ✓ OBLIGATORISCH: Beginne IMMER mit einer expliziten Kennzeichnung, z.B.: + * "Aus meiner Web-Recherche habe ich folgende Informationen gefunden:" + * "Laut meiner Internet-Recherche:" + * "Aus meiner Online-Suche:" +- ✓ OBLIGATORISCH: Gib IMMER die konkreten Quellen DIREKT NACH der jeweiligen Information an (nicht am Ende!) +- ✓ OBLIGATORISCH: Format: [Information] ([Quelle: Website-Name](URL)) +- ✓ OBLIGATORISCH: Bei mehreren Informationen: Gib nach JEDER Information die entsprechende Quelle an +- ✓ OBLIGATORISCH: Trenne klar zwischen Datenbank-Informationen und Web-Recherchen +- ✓ OBLIGATORISCH: Wenn sowohl Datenbank- als auch Web-Informationen vorhanden sind, trenne diese klar in separaten Abschnitten + +⚠️⚠️⚠️ DATENBLATT-LINKS - ABSOLUT VERBINDLICH ⚠️⚠️⚠️ +Wenn Web-Recherche-Ergebnisse vorhanden sind, MUSS du IMMER: +- ✓ OBLIGATORISCH: Explizit erwähnen, dass Datenblätter verfügbar sind +- ✓ OBLIGATORISCH: ALLE verfügbaren Datenblatt-Links angeben (vollständige URLs) +- ✓ OBLIGATORISCH: Format: "Datenblätter verfügbar: [Link 1](URL1), [Link 2](URL2)" +- ✓ OBLIGATORISCH: Wenn keine direkten Datenblatt-Links vorhanden sind, gib Links zu Seiten mit technischen Informationen an +- ❌ VERBOTEN: Datenblatt-Links zu verschweigen oder nicht explizit zu erwähnen + +⚠️⚠️⚠️ AUSFÜHRLICHE INFORMATIONEN - ABSOLUT VERBINDLICH ⚠️⚠️⚠️ +Wenn Web-Recherche-Ergebnisse vorhanden sind, MUSS du: +- ✓ OBLIGATORISCH: AUSFÜHRLICHE Informationen präsentieren (nicht nur kurze Zusammenfassungen!) +- ✓ OBLIGATORISCH: Alle relevanten technischen Details angeben: + * Technische Spezifikationen (Größe, Gewicht, Abmessungen, etc.) + * Betriebsbedingungen (Temperatur, Spannung, etc.) + * Kompatibilität und Anwendungsbereiche + * Zertifizierungen und Normen + * Installation und Verwendung + * Weitere relevante Produktdetails +- ✓ OBLIGATORISCH: Strukturiere die Informationen übersichtlich (z.B. mit Abschnitten oder Aufzählungen) +- ❌ VERBOTEN: Nur oberflächliche Informationen zu geben +- ❌ VERBOTEN: Wichtige Details auszulassen + +BEISPIEL FÜR KORREKTE QUELLENANGABE MIT INLINE-QUELLEN: +"Aus meiner Web-Recherche habe ich folgende Informationen gefunden: + +**Technische Spezifikationen:** +- Speicherkapazität: 2 GB ([Quelle: Siemens Support](https://...)) +- Format: Secure Digital (SD) Card ([Quelle: Best4Automation](https://...)) +- Betriebsspannung: 3,3 V DC ([Quelle: Automation24](https://...)) + +**Kompatibilität:** +- Geeignet für SIMATIC HMI Comfort Panels ([Quelle: Siemens Support](https://...)) +- Montage im Hoch- und Querformat möglich ([Quelle: Best4Automation](https://...)) + +**Zertifizierungen:** +- CE-zertifiziert ([Quelle: Automation24](https://...)) +- Für ATEX-Zonen geeignet ([Quelle: Elit](https://...)) + +**Datenblätter verfügbar:** +- [Siemens Produktdatenblatt](https://...) 
+- [Technische Dokumentation](https://...)" + +NIEMALS Informationen aus Web-Recherchen präsentieren, ohne explizit zu erwähnen, dass es sich um eine Web-Recherche handelt und ohne die Quellen DIREKT NACH der jeweiligen Information anzugeben! TABELLENLÄNGE UND ARTIKELANZAHL - KRITISCH: WICHTIG: Zeige MAXIMAL 20 Artikel in Tabellen. Du darfst und sollst aber ausführliche Erklärungen liefern! @@ -316,6 +368,17 @@ Analysiere die Benutzeranfrage und bestimme: - Jede Abfrage sollte fokussiert sein und die benötigten Informationen aus einer spezifischen Tabelle/Datenquelle abrufen 4. Begründung für deine Entscheidung +⚠️⚠️⚠️ WICHTIG - WEB-RECHERCHE BEI ZUSÄTZLICHEN INFORMATIONEN ⚠️⚠️⚠️ +Wenn der Nutzer nach zusätzlichen Informationen fragt oder explizit eine Recherche anfordert, MUSS IMMER eine Web-Recherche durchgeführt werden (needsWebResearch = true). +Beispiele für solche Anfragen: +- "recherchier nach weiteren informationen zu diesem produkt" +- "suche nach zusätzlichen informationen" +- "finde mehr details" +- "recherchiere im internet" +- "suche online nach" +- Ähnliche Formulierungen, die eine Recherche oder zusätzliche Informationen anfordern +In diesen Fällen IMMER needsWebResearch auf true setzen! + WICHTIG für SQL-Abfragen: - Verwende IMMER doppelte Anführungszeichen für Spaltennamen - Bei Lagerbestandsabfragen: IMMER S_RESERVIERTER__BESTAND und verfügbaren Bestand einbeziehen @@ -550,6 +613,21 @@ Antworte auf die folgende Frage des Nutzers: {user_prompt}{context} KRITISCH: Verwende NUR die oben angegebenen Daten. Erfinde KEINE Werte. Wenn Daten fehlen, schreibe "Nicht verfügbar". +⚠️⚠️⚠️ ABSOLUT KRITISCH - WEB-RECHERCHE QUELLENANGABE ⚠️⚠️⚠️ +Wenn WEB-RECHERCHE-ERGEBNISSE oben vorhanden sind, MUSS du: +- ✓ IMMER explizit erwähnen, dass die Informationen aus einer Web-Recherche stammen +- ✓ IMMER alle Quellen DIREKT NACH der jeweiligen Information angeben (INLINE, nicht am Ende!) +- ✓ Format: [Information] ([Quelle: Website-Name](URL)) +- ✓ IMMER AUSFÜHRLICHE Informationen präsentieren (nicht nur kurze Zusammenfassungen!) +- ✓ IMMER alle verfügbaren Datenblatt-Links explizit erwähnen und angeben +- ✓ Format für Datenblätter: "Datenblätter verfügbar: [Link 1](URL1), [Link 2](URL2)" +- ✓ Die Web-Recherche-Informationen klar von Datenbank-Informationen trennen +- ❌ VERBOTEN: Web-Recherche-Informationen ohne explizite Kennzeichnung zu präsentieren +- ❌ VERBOTEN: Web-Recherche-Informationen ohne Quellenangabe zu präsentieren +- ❌ VERBOTEN: Quellen nur am Ende als Liste zu präsentieren +- ❌ VERBOTEN: Datenblatt-Links zu verschweigen oder nicht explizit zu erwähnen +- ❌ VERBOTEN: Nur oberflächliche Informationen zu geben + ⚠️⚠️⚠️ ABSOLUT VERBOTEN - KEINE DATEN ERFINDEN ⚠️⚠️⚠️ Wenn KEINE Datenbank-Ergebnisse vorhanden sind (keine DATENBANK-ERGEBNISSE oder STRUKTURIERTE DATEN oben), dann: - ❌ ERFINDE KEINE Artikelnummern, Artikelbezeichnungen, Preise oder Lagerbestände! diff --git a/modules/features/chatbot/mainChatbot.py b/modules/features/chatbot/mainChatbot.py index acb9a0e6..6f0cf770 100644 --- a/modules/features/chatbot/mainChatbot.py +++ b/modules/features/chatbot/mainChatbot.py @@ -315,6 +315,210 @@ async def _check_workflow_stopped(interfaceDbChat, workflowId: str) -> bool: return False +def _buildWebResearchQuery(userPrompt: str, workflowMessages: List, queryResults: Optional[Dict[str, Any]] = None) -> str: + """ + Build enriched web research query by extracting product context from conversation history and current prompt. + + Extracts product information from: + 1. 
Current user prompt (article numbers, product mentions) + 2. Database query results (if available) + 3. Previous assistant messages (conversation history) + + Args: + userPrompt: Current user prompt + workflowMessages: List of workflow messages (conversation history) + queryResults: Optional database query results to extract product info from + + Returns: + Enriched search query string + """ + # Normalize user prompt for detection + prompt_lower = userPrompt.lower().strip() + + # Patterns that indicate a search request + search_patterns = [ + "ja", "yes", "oui", "si", + "such", "suche", "search", "recherche", "recherchier", + "internet", "web", "online", + "datenblatt", "datasheet", "fiche technique", + "mehr informationen", "more information", "plus d'information", + "weitere informationen", "further information", "additional information" + ] + + # Check if current prompt contains search-related keywords + has_search_intent = any(pattern in prompt_lower for pattern in search_patterns) + + # Extract product information - try multiple sources + article_number = None + article_description = None + supplier = None + + # Pattern for article numbers like "6AV2 181-8XP00-0AX0" or "6AV2181-8XP00-0AX0" + article_patterns = [ + r'\b[A-Z0-9]{2,}\s+[0-9]{3,}-[A-Z0-9-]+\b', # With space: "6AV2 181-8XP00-0AX0" + r'\b[A-Z0-9]{4,}[\s-][A-Z0-9-]{6,}\b', # General pattern + r'\b[A-Z]{2,}[0-9]+\s+[0-9]+-[A-Z0-9-]+\b', # Specific Siemens pattern + ] + + # 1. First, try to extract from current user prompt + for pattern in article_patterns: + matches = re.findall(pattern, userPrompt) + if matches: + article_number = matches[0] + logger.info(f"Extracted article number from user prompt: {article_number}") + break + + # 2. Try to extract from database query results if available + # Always check queryResults to enrich with product description and supplier, even if article_number was already found + if queryResults: + # Look for article numbers in query result text (if not already found) + if not article_number: + for key in queryResults.keys(): + if key.startswith("query_") and not key.endswith("_error") and not key.endswith("_data"): + result_text = queryResults.get(key, "") + if isinstance(result_text, str): + for pattern in article_patterns: + matches = re.findall(pattern, result_text) + if matches: + article_number = matches[0] + logger.info(f"Extracted article number from query results: {article_number}") + break + if article_number: + break + + # Always check data arrays for product description and supplier (even if article_number already found) + for key in queryResults.keys(): + if key.startswith("query_") and not key.endswith("_error") and not key.endswith("_data"): + data_key = f"{key}_data" + if data_key in queryResults: + data_array = queryResults[data_key] + if isinstance(data_array, list) and len(data_array) > 0: + # Look for article number in first row (if not already found) + first_row = data_array[0] + if isinstance(first_row, dict): + # Check common article number fields (if not already found) + if not article_number: + for field in ["Artikelnummer", "Artikelkürzel", "article_number", "articleNumber"]: + if field in first_row and first_row[field]: + article_number = str(first_row[field]) + logger.info(f"Extracted article number from query data: {article_number}") + break + + # Always check article description (can enrich even if article_number already found) + if not article_description: + for field in ["Artikelbezeichnung", "Bezeichnung", "article_description", "description"]: + if field in 
first_row and first_row[field]: + article_description = str(first_row[field]) + logger.info(f"Extracted article description from query data: {article_description}") + break + + # Always check supplier (can enrich even if article_number already found) + if not supplier: + for field in ["Lieferant", "Supplier", "supplier"]: + if field in first_row and first_row[field]: + supplier = str(first_row[field]) + logger.info(f"Extracted supplier from query data: {supplier}") + break + + # If we found all needed info, we can stop + if article_number and article_description and supplier: + break + + # 3. Extract from previous assistant messages (conversation history) + if not article_number or not article_description: + for msg in reversed(workflowMessages[-10:]): + if msg.role == "assistant": + message_text = msg.message + + # Extract article number if not found yet + if not article_number: + for pattern in article_patterns: + matches = re.findall(pattern, message_text) + if matches: + article_number = matches[0] + break + + # Extract article description if not found yet + if not article_description: + description_patterns = [ + r'Es handelt sich um\s+([^\.]+)', + r'It is a\s+([^\.]+)', + r'C\'est\s+([^\.]+)', + r'Bezeichnung:\s*([^\n]+)', + r'Description:\s*([^\n]+)', + r'Artikelbezeichnung:\s*([^\n]+)', + r'Artikelbezeichnung:\s*([^\n]+)' + ] + for pattern in description_patterns: + match = re.search(pattern, message_text, re.IGNORECASE) + if match: + article_description = match.group(1).strip() + break + + # Extract supplier if not found yet + if not supplier: + supplier_patterns = [ + r'von\s+([A-Z][A-Za-z\s]+(?:AG|GmbH|Ltd|Inc|Corp)?)', + r'from\s+([A-Z][A-Za-z\s]+(?:AG|GmbH|Ltd|Inc|Corp)?)', + r'Lieferant:\s*([^\n]+)', + r'Supplier:\s*([^\n]+)' + ] + for pattern in supplier_patterns: + match = re.search(pattern, message_text, re.IGNORECASE) + if match: + supplier = match.group(1).strip() + break + + # Stop if we found everything + if article_number and article_description and supplier: + break + + # Build enriched search query + query_parts = [] + + # If we have search intent but no product info, try to use the user prompt intelligently + if has_search_intent and not article_number and not article_description: + # Try to extract meaningful parts from the prompt + # Remove common search phrases and keep the product-related parts + cleaned_prompt = userPrompt + for phrase in ["recherchier nach", "recherche", "suche nach", "search for", "find", "informationen zu", "information about", "weitere informationen", "further information"]: + cleaned_prompt = re.sub(phrase, "", cleaned_prompt, flags=re.IGNORECASE) + cleaned_prompt = cleaned_prompt.strip() + + # If cleaned prompt still has content and is different, use it + if cleaned_prompt and cleaned_prompt != userPrompt and len(cleaned_prompt) > 10: + query_parts.append(cleaned_prompt) + + # Add article description if found + if article_description: + query_parts.append(article_description) + + # Add article number if found + if article_number: + query_parts.append(article_number) + + # Add supplier if found + if supplier: + query_parts.append(supplier) + + # Add "Datenblatt" or "datasheet" if user requested it or if we have product info + if "datenblatt" in prompt_lower or "datasheet" in prompt_lower or "fiche technique" in prompt_lower: + query_parts.append("Datenblatt") + elif query_parts: + # If we have product info but no explicit request for datasheet, add it anyway + query_parts.append("Datenblatt") + + # If we found product information or built 
a meaningful query, use it + if query_parts: + enriched_query = " ".join(query_parts) + logger.info(f"Built enriched search query: '{enriched_query}' from context (original: '{userPrompt}')") + return enriched_query + else: + # Fall back to original prompt, but try to clean it up + logger.info(f"No product context found, using original prompt: '{userPrompt}'") + return userPrompt + + async def _processChatbotMessage( services, workflowId: str, @@ -398,6 +602,11 @@ async def _processChatbotMessage( logger.info(f"Analysis: DB={needsDatabaseQuery}, Web={needsWebResearch}, SQL queries={len(sql_queries)}") + # Build initial enriched web research query if needed (for logging, will be rebuilt after DB queries) + enriched_web_query = None + if needsWebResearch: + enriched_web_query = _buildWebResearchQuery(userInput.prompt, workflow.messages) + # Build list of queries to stream back queries = [] @@ -414,7 +623,7 @@ async def _processChatbotMessage( if needsWebResearch: queries.append({ "type": "web", - "query": userInput.prompt, + "query": enriched_web_query or userInput.prompt, "reasoning": reasoning }) @@ -499,8 +708,17 @@ async def _processChatbotMessage( await _emit_log_and_event(interfaceDbChat, workflowId, event_manager, "Suche im Internet nach Informationen...") try: + # Rebuild enriched query with database results if available (better product context) + web_research_query = _buildWebResearchQuery( + userInput.prompt, + workflow.messages, + queryResults if queryResults else None + ) + + logger.info(f"Using enriched web research query: '{web_research_query}'") + researchResult = await services.web.performWebResearch( - prompt=userInput.prompt, + prompt=web_research_query, urls=[], country=None, language=userInput.userLanguage or "de", @@ -510,7 +728,7 @@ async def _processChatbotMessage( webResearchResults = json.dumps(researchResult, ensure_ascii=False, indent=2) if isinstance(researchResult, dict) else str(researchResult) await _emit_log_and_event(interfaceDbChat, workflowId, event_manager, "Internet-Recherche abgeschlossen") except Exception as e: - logger.error(f"Web research failed: {e}") + logger.error(f"Web research failed: {e}", exc_info=True) webResearchResults = f"Web research error: {str(e)}" await _emit_log_and_event(interfaceDbChat, workflowId, event_manager, "Internet-Recherche fehlgeschlagen", log_type="warning") diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py index a07aa441..6341c895 100644 --- a/modules/services/serviceAi/mainServiceAi.py +++ b/modules/services/serviceAi/mainServiceAi.py @@ -383,15 +383,36 @@ Respond with ONLY a JSON object in this exact format: operationType=opType.value ) - self.services.chat.storeWorkflowStat( - self.services.workflow, - response, - f"ai.{opType.name.lower()}" - ) + # Try to store workflow stats, but don't fail if workflow is None (e.g., in chatbot context) + try: + self.services.chat.storeWorkflowStat( + self.services.workflow, + response, + f"ai.{opType.name.lower()}" + ) + except Exception as e: + # Log but don't fail - workflow might be None in some contexts (e.g., chatbot) + logger.debug(f"Could not store workflow stat (workflow may be None): {str(e)}") self.services.chat.progressLogUpdate(aiOperationId, 0.9, f"{opType.name} completed") self.services.chat.progressLogFinish(aiOperationId, True) + # Preserve metadata from response if available (e.g., results_with_content from Tavily) + # Check if response has metadata attribute (AiCallResponse from callAi) + if hasattr(response, 
'metadata') and response.metadata: + # If metadata is a dict, store it in additionalData + if isinstance(response.metadata, dict): + if not metadata.additionalData: + metadata.additionalData = {} + metadata.additionalData.update(response.metadata) + # If metadata is an object with attributes, extract them + elif hasattr(response.metadata, '__dict__'): + if not metadata.additionalData: + metadata.additionalData = {} + for key, value in response.metadata.__dict__.items(): + if not key.startswith('_'): + metadata.additionalData[key] = value + return AiResponse( content=response.content, metadata=metadata diff --git a/modules/services/serviceWeb/mainServiceWeb.py b/modules/services/serviceWeb/mainServiceWeb.py index 469ca6ae..b6b90374 100644 --- a/modules/services/serviceWeb/mainServiceWeb.py +++ b/modules/services/serviceWeb/mainServiceWeb.py @@ -85,22 +85,119 @@ class WebService: if extractedUrls: allUrls.extend(extractedUrls) - # Step 2: Search for URLs if needed (based on needsSearch flag) + # Step 2: Search for URLs and content if needed (based on needsSearch flag) + searchUrls = [] + searchResultsWithContent = [] if needsSearch and (not allUrls or len(allUrls) < maxNumberPages): - self.services.chat.progressLogUpdate(operationId, 0.3, "Searching for URLs") + self.services.chat.progressLogUpdate(operationId, 0.3, "Searching for URLs and content") - searchUrls = await self._performWebSearch( - instruction=instruction, - maxNumberPages=maxNumberPages - len(allUrls), - country=countryCode, - language=languageCode - ) + try: + searchUrls, searchResultsWithContent = await self._performWebSearch( + instruction=instruction, + maxNumberPages=maxNumberPages - len(allUrls), + country=countryCode, + language=languageCode + ) + logger.info(f"Tavily search returned {len(searchUrls)} URLs with {len(searchResultsWithContent)} results containing content") + except Exception as e: + logger.error(f"Error performing Tavily search (continuing with other URLs): {str(e)}", exc_info=True) + searchUrls = [] + searchResultsWithContent = [] - # Add search URLs to the list - allUrls.extend(searchUrls) + # Prioritize Tavily search URLs over AI-extracted URLs (they're more relevant) + if searchUrls: + # Prepend Tavily URLs to the list (they're more relevant) + allUrls = searchUrls + allUrls + logger.info(f"Using {len(searchUrls)} Tavily URLs + {len(allUrls) - len(searchUrls)} other URLs = {len(allUrls)} total") + else: + # If Tavily search failed, use AI-extracted URLs + logger.warning("Tavily search returned no URLs, using AI-extracted URLs only") self.services.chat.progressLogUpdate(operationId, 0.5, f"Found {len(allUrls)} total URLs") + # If we have search results (even without content), use them directly instead of crawling + # Tavily search results are more relevant than generic AI-extracted URLs + # Only crawl if we have NO search results at all + if searchResultsWithContent and len(searchResultsWithContent) > 0: + urls_with_actual_content = sum(1 for r in searchResultsWithContent if r.get("content") and len(r.get("content", "")) > 0) + logger.info(f"Using {len(searchResultsWithContent)} Tavily search results ({urls_with_actual_content} with content) directly (skipping crawl)") + + # Convert search results to crawl result format + crawlResult = [] + for result in searchResultsWithContent: + crawlResult.append({ + "url": result["url"], + "title": result.get("title", ""), + "content": result.get("content", "") + }) + + # Calculate statistics + totalResults = len(crawlResult) + totalContentLength = 
sum(len(r.get("content", "")) for r in crawlResult) + urlsWithContent = sum(1 for r in crawlResult if r.get("content") and len(r.get("content", "")) > 0) + + # Log content availability + if urlsWithContent == 0: + logger.warning(f"Tavily search returned {len(searchResultsWithContent)} results but none have content - URLs will be used but may need crawling") + else: + logger.info(f"Tavily search provided content for {urlsWithContent}/{len(searchResultsWithContent)} URLs") + + # Even if content is empty, use these results - they're more relevant than generic URLs + # The final answer generation can work with URLs even if content is empty + + # Convert to sections format + sections = [] + for idx, item in enumerate(crawlResult): + section = { + "id": f"result_{idx}", + "content_type": "paragraph", + "title": item.get("title") or item.get("url", f"Result {idx + 1}"), + "order": idx + } + content = item.get("content", "") + if content: + section["textPreview"] = content[:200] + ("..." if len(content) > 200 else "") + sections.append(section) + + # Return consolidated result + result = { + "metadata": { + "title": suggestedFilename or instruction[:100] if instruction else "Web Research Results", + "extraction_method": "tavily_search_direct", + "research_depth": finalResearchDepth, + "country": countryCode, + "language": languageCode, + "urls_searched": searchUrls[:20], + "total_urls": len(searchUrls), + "urls_with_content": urlsWithContent, + "total_content_length": totalContentLength, + "search_date": self.services.utils.timestampGetUtc() if hasattr(self.services, 'utils') else None + }, + "sections": sections, + "statistics": { + "sectionCount": len(sections), + "total_urls": len(searchUrls), + "results_count": totalResults, + "urls_with_content": urlsWithContent, + "total_content_length": totalContentLength + }, + "instruction": instruction, + "urls_crawled": searchUrls, + "total_urls": len(searchUrls), + "results": crawlResult, + "total_results": totalResults + } + + if suggestedFilename: + result["suggested_filename"] = suggestedFilename + result["metadata"]["suggested_filename"] = suggestedFilename + + if operationId: + self.services.chat.progressLogUpdate(operationId, 0.9, "Completed") + self.services.chat.progressLogFinish(operationId, True) + + return result + # Step 3: Validate and filter URLs before crawling validatedUrls = self._validateUrls(allUrls) if not validatedUrls: @@ -322,8 +419,16 @@ Return ONLY valid JSON, no additional text: maxNumberPages: int, country: Optional[str], language: Optional[str] - ) -> List[str]: - """Perform web search to find URLs.""" + ) -> tuple[List[str], List[Dict[str, Any]]]: + """ + Perform web search to find URLs and content. 
+ + Returns: + Tuple of (urls, search_results_with_content) + - urls: List of URL strings + - search_results_with_content: List of dicts with url, title, content from Tavily search + """ + search_results_with_content = [] try: # Build search prompt model searchPromptModel = AiCallPromptWebSearch( @@ -350,46 +455,224 @@ Return ONLY valid JSON, no additional text: outputFormat="json" ) + # Check if metadata contains results with content (from Tavily) + if hasattr(searchResponse, 'metadata') and searchResponse.metadata: + # Check in additionalData first (where we store custom metadata) + additional_data = None + if hasattr(searchResponse.metadata, 'additionalData') and searchResponse.metadata.additionalData: + additional_data = searchResponse.metadata.additionalData + elif isinstance(searchResponse.metadata, dict): + additional_data = searchResponse.metadata.get("additionalData", {}) + + if additional_data: + results_with_content = additional_data.get("results_with_content", []) + if results_with_content: + logger.info(f"Found {len(results_with_content)} search results with content in metadata.additionalData") + # Extract URLs and content from metadata + for result in results_with_content: + if result.get("url"): + search_results_with_content.append({ + "url": result.get("url"), + "title": result.get("title", ""), + "content": result.get("content", ""), + "score": result.get("score", 0) + }) + + # Also check directly in metadata (fallback) + if not search_results_with_content: + results_with_content = None + if hasattr(searchResponse.metadata, 'results_with_content'): + results_with_content = searchResponse.metadata.results_with_content + elif isinstance(searchResponse.metadata, dict): + results_with_content = searchResponse.metadata.get("results_with_content", []) + + if results_with_content: + logger.info(f"Found {len(results_with_content)} search results with content in metadata (direct)") + for result in results_with_content: + if result.get("url"): + search_results_with_content.append({ + "url": result.get("url"), + "title": result.get("title", ""), + "content": result.get("content", ""), + "score": result.get("score", 0) + }) + # Extract content from AiResponse searchResult = searchResponse.content + logger.debug(f"Search response content type: {type(searchResult)}, length: {len(str(searchResult)) if searchResult else 0}") + # Debug: persist search response if isinstance(searchResult, str): self.services.utils.writeDebugFile(searchResult, "websearch_response") + logger.debug(f"Search response (first 500 chars): {searchResult[:500]}") else: self.services.utils.writeDebugFile(json.dumps(searchResult, indent=2), "websearch_response") + logger.debug(f"Search response type: {type(searchResult)}, keys: {list(searchResult.keys()) if isinstance(searchResult, dict) else 'N/A'}") - # Parse and extract URLs + # Parse and extract URLs and content if isinstance(searchResult, str): # Extract JSON from response (handles markdown code blocks) extractedJson = self.services.utils.jsonExtractString(searchResult) - searchData = json.loads(extractedJson) if extractedJson else json.loads(searchResult) + if extractedJson: + try: + searchData = json.loads(extractedJson) + logger.debug(f"Parsed JSON from extracted string, type: {type(searchData)}") + except json.JSONDecodeError as e: + logger.warning(f"Failed to parse extracted JSON: {e}, trying direct parse") + searchData = json.loads(searchResult) + else: + try: + searchData = json.loads(searchResult) + logger.debug(f"Parsed JSON directly from string, type: 
{type(searchData)}") + except json.JSONDecodeError as e: + logger.error(f"Failed to parse search result as JSON: {e}") + searchData = {} else: searchData = searchResult + logger.debug(f"Using searchResult directly as searchData, type: {type(searchData)}") - # Extract URLs from response + logger.debug(f"Final searchData type: {type(searchData)}, is dict: {isinstance(searchData, dict)}, keys: {list(searchData.keys()) if isinstance(searchData, dict) else 'N/A'}") + + # Extract URLs and content from response urls = [] if isinstance(searchData, dict): - if "urls" in searchData: + # Check for new format: {"urls": [...], "results": [...]} + if "urls" in searchData and "results" in searchData: urls = searchData["urls"] + # Extract results with content + for r in searchData["results"]: + if r.get("url"): + # Only add if not already added from metadata + if not any(sr["url"] == r.get("url") for sr in search_results_with_content): + search_results_with_content.append({ + "url": r.get("url"), + "title": r.get("title", ""), + "content": r.get("content", ""), + "score": r.get("score", 0) + }) + logger.debug(f"Extracted {len(urls)} URLs and {len(search_results_with_content)} results with content from new format") + elif "urls" in searchData: + urls = searchData["urls"] + logger.debug(f"Extracted {len(urls)} URLs from 'urls' field") elif "results" in searchData: - urls = [r.get("url") for r in searchData["results"] if r.get("url")] + # Extract URLs from results (content already in search_results_with_content if from metadata) + for r in searchData["results"]: + if r.get("url"): + urls.append(r.get("url")) + # Only add to search_results_with_content if not already added from metadata + if not any(sr["url"] == r.get("url") for sr in search_results_with_content): + search_results_with_content.append({ + "url": r.get("url"), + "title": r.get("title", ""), + "content": r.get("raw_content") or r.get("content", ""), + "score": r.get("score", 0) + }) + logger.debug(f"Extracted {len(urls)} URLs with content from 'results' field") + else: + logger.warning(f"Unexpected search data structure (dict): {list(searchData.keys())}") elif isinstance(searchData, list): # Handle both cases: list of URL strings or list of dicts with "url" key for item in searchData: if isinstance(item, str): # Item is already a URL string urls.append(item) - elif isinstance(item, dict) and item.get("url"): - # Item is a dict with "url" key - urls.append(item.get("url")) + elif isinstance(item, dict): + if item.get("url"): + urls.append(item.get("url")) + # Only add to search_results_with_content if not already added from metadata + if not any(sr["url"] == item.get("url") for sr in search_results_with_content): + search_results_with_content.append({ + "url": item.get("url"), + "title": item.get("title", ""), + "content": item.get("raw_content") or item.get("content", ""), + "score": item.get("score", 0) + }) + logger.debug(f"Extracted {len(urls)} URLs from list") + else: + logger.warning(f"Unexpected search data type: {type(searchData)}") - logger.info(f"Web search returned {len(urls)} URLs") - return urls + # If we got URLs but no content from metadata, extract URLs from search_results_with_content + if urls and not search_results_with_content: + # URLs were extracted but no content - create entries with empty content + for url in urls: + search_results_with_content.append({ + "url": url, + "title": "", + "content": "", + "score": 0 + }) + elif search_results_with_content and not urls: + # We have content but no URLs - extract URLs from 
content results + urls = [r["url"] for r in search_results_with_content] + + # If we have URLs but no search_results_with_content, create entries from URLs + # This ensures we can use Tavily URLs even if content extraction failed + if urls and not search_results_with_content: + logger.warning("Got URLs from Tavily search but no content extracted - creating entries for direct use") + for url in urls: + search_results_with_content.append({ + "url": url, + "title": "", + "content": "", # Empty content - will need crawling if used + "score": 0 + }) + + logger.info(f"Web search returned {len(urls)} URLs with {len(search_results_with_content)} results") + if search_results_with_content: + content_count = sum(1 for r in search_results_with_content if r.get("content") and len(r.get("content", "")) > 0) + logger.info(f" - {content_count} results have content, {len(search_results_with_content) - content_count} without content") + if content_count > 0: + first_with_content = next((r for r in search_results_with_content if r.get("content")), None) + if first_with_content: + logger.info(f"Content preview from first result with content: {first_with_content.get('content', '')[:200]}") + else: + logger.warning("No search results extracted - will need to crawl URLs") + return urls, search_results_with_content except Exception as e: - logger.error(f"Error in web search: {str(e)}") - return [] + logger.error(f"Error in web search: {str(e)}", exc_info=True) + # Even if there's an error, try to extract URLs from the response if available + recovered_urls = [] + recovered_results = [] + try: + if 'searchResponse' in locals() and searchResponse: + logger.info(f"Attempting to extract URLs from error response: {type(searchResponse)}") + # Try to get content from response + if hasattr(searchResponse, 'content'): + errorContent = searchResponse.content + if isinstance(errorContent, str): + # Try to parse as JSON + try: + errorData = json.loads(errorContent) + if isinstance(errorData, dict): + if "urls" in errorData: + recovered_urls = errorData["urls"] + elif "results" in errorData: + recovered_urls = [r.get("url") for r in errorData["results"] if r.get("url")] + recovered_results = [{"url": r.get("url"), "title": r.get("title", ""), "content": r.get("content", ""), "score": 0} for r in errorData["results"]] + elif isinstance(errorData, list): + recovered_urls = [item if isinstance(item, str) else item.get("url", "") for item in errorData if item] + if recovered_urls: + logger.info(f"Recovered {len(recovered_urls)} URLs from error response") + # Create entries for recovered URLs + if not recovered_results: + for url in recovered_urls: + recovered_results.append({"url": url, "title": "", "content": "", "score": 0}) + return recovered_urls, recovered_results + except Exception as parseError: + logger.debug(f"Failed to parse error response: {parseError}") + except Exception as recoverError: + logger.debug(f"Failed to recover URLs from error: {recoverError}") + + # If we have URLs from earlier extraction, return them + if 'urls' in locals() and urls: + logger.info(f"Returning {len(urls)} URLs extracted before error occurred") + # Create entries from URLs + results_from_urls = [{"url": url, "title": "", "content": "", "score": 0} for url in urls] + return urls, results_from_urls + + return [], [] def _validateUrls(self, urls: List[str]) -> List[str]: """