fix: fixed merge conflicts from chatbot
Commit: b06375352e
10 changed files with 2583 additions and 652 deletions
docs/STREAMING_UTILITY_ARCHITECTURE.md (new file, 314 lines)

@@ -0,0 +1,314 @@

# Streaming Utility Architecture: Transforming eventManager into a Shared Utility

## Current State Analysis

### Existing Implementation
The `eventManager.py` in `modules/features/chatbot/` currently provides:
- **Event Queue Management**: Per-workflow asyncio queues for SSE streaming
- **Event Emission**: `emit_event()` method for chatbot-specific events
- **Cleanup**: Automatic queue cleanup after workflow completion
- **SSE Streaming**: Used in the `/api/chatbot/start/stream` endpoint

### Current Limitations

1. **Chatbot-Specific**: Hardcoded for chatbot workflows only
2. **Polling Still Required**: The frontend still polls `getUnifiedChatData()` every 0.5 seconds even with SSE
3. **Not Reusable**: Other features (workflows, document generation, etc.) can't use it
4. **Mixed Approach**: The SSE endpoint still polls the database internally instead of streaming in a purely event-driven way

### Frontend Polling Pattern
Currently, the frontend uses:
- `useWorkflowPolling.ts` - Polls `/api/workflow/{id}/chatData` every few seconds
- `useWorkflowLifecycle.ts` - Manages the polling lifecycle and state
- Rate-limit handling and backoff logic for failed polls

## Proposed Architecture: Shared Streaming Utility

### 1. Generic Event Manager (`modules/shared/streamingManager.py`)

```python
class StreamingEventManager:
    """
    Generic event manager for real-time streaming across all features.
    Supports multiple event types and contexts (workflows, documents, tasks, etc.).
    """

    def __init__(self):
        self._queues: Dict[str, asyncio.Queue] = {}
        self._locks: Dict[str, asyncio.Lock] = {}
        self._cleanup_tasks: Dict[str, asyncio.Task] = {}
        self._subscribers: Dict[str, Set[str]] = {}  # context_id -> set of queue_ids

    async def emit_event(
        self,
        context_id: str,                 # workflow_id, document_id, task_id, etc.
        event_type: str,                 # "message", "log", "status", "progress", "complete", "error"
        data: Dict[str, Any],            # Flexible data structure
        event_category: str = "default"  # "chat", "workflow", "document", etc.
    ):
        """Emit event to all subscribers of a context"""

    def create_stream(
        self,
        context_id: str,
        event_categories: Optional[List[str]] = None  # Filter by category
    ) -> asyncio.Queue:
        """Create a new stream for a context"""

    async def stream_events(
        self,
        context_id: str,
        event_categories: Optional[List[str]] = None
    ) -> AsyncIterator[Dict[str, Any]]:
        """Async generator for streaming events"""
```

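The examples that follow all call `get_streaming_manager()`, which the sketch above does not define. A minimal module-level singleton accessor could look like this (a sketch; the name matches the usage below but does not exist yet):

```python
from typing import Optional

_streaming_manager: Optional[StreamingEventManager] = None

def get_streaming_manager() -> StreamingEventManager:
    """Return the process-wide StreamingEventManager, creating it on first use."""
    global _streaming_manager
    if _streaming_manager is None:
        _streaming_manager = StreamingEventManager()
    return _streaming_manager
```
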
### 2. Generic SSE Route Helper (`modules/shared/sseUtils.py`)

```python
def create_sse_stream(
    context_id: str,
    event_categories: Optional[List[str]] = None,
    initial_data_callback: Optional[Callable] = None,
    timeout: float = 300.0
) -> StreamingResponse:
    """
    Create a generic SSE streaming response.

    Args:
        context_id: Workflow ID, document ID, or other context identifier
        event_categories: Filter events by category (e.g., ["chat", "workflow"])
        initial_data_callback: Optional function to fetch initial state
        timeout: Stream timeout in seconds
    """
    streaming_manager = get_streaming_manager()

    async def event_stream():
        # Send initial data if callback provided
        if initial_data_callback:
            initial_data = await initial_data_callback(context_id)
            yield format_sse_event("initial", initial_data)

        # Stream events from manager
        async for event in streaming_manager.stream_events(context_id, event_categories):
            yield format_sse_event(event["type"], event["data"])

    return StreamingResponse(
        event_stream(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no"
        }
    )
```

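`format_sse_event()` is used above but not specified. A minimal sketch, assuming events are JSON-serialized and framed with the standard `event:`/`data:` fields of the SSE wire format:

```python
import json
from typing import Any, Dict

def format_sse_event(event_type: str, data: Dict[str, Any]) -> str:
    """Serialize one event in SSE wire format; the blank line terminates it."""
    return f"event: {event_type}\ndata: {json.dumps(data)}\n\n"
```
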
### 3. Integration Points

#### A. Workflow Processing
```python
# In workflow processing code
from modules.shared.streamingManager import get_streaming_manager

streaming_manager = get_streaming_manager()

# Emit progress updates
await streaming_manager.emit_event(
    context_id=workflow_id,
    event_type="progress",
    data={"step": "analyzing", "message": "Processing documents..."},
    event_category="workflow"
)

# Emit new messages
await streaming_manager.emit_event(
    context_id=workflow_id,
    event_type="message",
    data={"role": "assistant", "content": "Response text"},
    event_category="chat"
)
```

#### B. Route Endpoints
```python
# Generic streaming endpoint for any context
@router.get("/{contextId}/stream")
async def stream_context_updates(
    contextId: str,
    categories: Optional[str] = Query(None),  # Comma-separated categories
    currentUser: User = Depends(getCurrentUser)
):
    event_categories = categories.split(",") if categories else None

    # Optional: Fetch initial state
    async def get_initial_data(ctx_id: str):
        interfaceDbChat = getServiceChat(currentUser)
        return interfaceDbChat.getUnifiedChatData(ctx_id, None)

    return create_sse_stream(
        context_id=contextId,
        event_categories=event_categories,
        initial_data_callback=get_initial_data
    )
```

## Benefits of Streaming vs Polling

### Performance
- **Reduced Server Load**: No constant database queries every 0.5-3 seconds
- **Lower Latency**: Events are delivered immediately (< 100 ms) instead of after a polling delay (500-3000 ms)
- **Bandwidth Efficiency**: Data is only sent when it changes, with no empty responses

### User Experience
- **Real-Time Updates**: Users see progress instantly
- **Better Responsiveness**: No perceived delay from polling intervals
- **Reduced Battery Drain**: Mobile devices consume less power without constant polling

### Scalability
- **Horizontal Scaling**: Event queues can be distributed (Redis, RabbitMQ)
- **Connection Management**: Better handling of many concurrent streams
- **Resource Efficiency**: One persistent connection instead of many HTTP requests

## Migration Strategy

### Phase 1: Create Shared Utility
1. Move `eventManager.py` → `modules/shared/streamingManager.py`
2. Generalize it for any context type (not just workflows)
3. Add event categorization and filtering
4. Create the `sseUtils.py` helper functions

### Phase 2: Update Chatbot Feature
1. Update the chatbot to use the shared streaming manager
2. Replace internal polling in the SSE endpoint with pure event-driven streaming
3. Emit events directly when data changes (in database write operations)

### Phase 3: Migrate Other Features
1. **Workflows**: Add streaming to workflow processing
2. **Document Generation**: Stream document creation progress
3. **Data Processing**: Stream extraction/transformation progress
4. **Any Long-Running Task**: Use streaming for status updates

### Phase 4: Frontend Migration
1. Replace `useWorkflowPolling` with SSE EventSource connections
2. Create a generic `useStreaming` hook
3. Update all components to use streaming instead of polling
4. Remove the polling logic entirely

## Implementation Details

### Event-Driven Data Emission

Instead of polling `getUnifiedChatData()`, emit events when data changes:

```python
# In interfaceDbChatObjects.py - when creating a message
def createMessage(self, workflowId: str, message: ChatMessage):
    # ... existing database write ...

    # Emit streaming event
    from modules.shared.streamingManager import get_streaming_manager
    streaming_manager = get_streaming_manager()
    asyncio.create_task(streaming_manager.emit_event(
        context_id=workflowId,
        event_type="message",
        data={
            "type": "message",
            "createdAt": message.publishedAt,
            "item": message.dict()
        },
        event_category="chat"
    ))
```

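One caveat with the snippet above: `asyncio.create_task()` raises `RuntimeError` when no event loop is running, and `createMessage` is synchronous. If it can be reached from a plain sync context, a small guard along these lines keeps the emission safe (`emit_in_background` is an assumed helper name, not existing code):

```python
import asyncio
from typing import Coroutine

def emit_in_background(coro: Coroutine) -> None:
    """Schedule an emission on the running loop, or run it to completion."""
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No running loop (plain sync caller): execute the coroutine directly.
        asyncio.run(coro)
        return
    loop.create_task(coro)
```
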
### Frontend Integration

```typescript
// Generic streaming hook
function useStreaming<T>(
  contextId: string,
  categories?: string[],
  onEvent?: (event: T) => void
) {
  useEffect(() => {
    const eventSource = new EventSource(
      `/api/stream/${contextId}?categories=${categories?.join(',')}`
    );

    eventSource.onmessage = (e) => {
      const event = JSON.parse(e.data);
      onEvent?.(event);
    };

    return () => eventSource.close();
  }, [contextId, categories]);
}
```

## Key Design Decisions

### 1. Context-Based Streaming
- Use a generic `context_id` instead of `workflow_id`
- Supports workflows, documents, tasks, user sessions, etc.

### 2. Event Categories
- Allow filtering by category (chat, workflow, document, etc.), as in the sketch below
- Enables multiple features to stream from the same context

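How the filter could work is not pinned down above. One possible body for `stream_events()` — a sketch assuming each queued event carries the `category` set by `emit_event()`:

```python
async def stream_events(
    self,
    context_id: str,
    event_categories: Optional[List[str]] = None
) -> AsyncIterator[Dict[str, Any]]:
    """Yield events for a context, skipping categories the caller did not request."""
    queue = self.create_stream(context_id, event_categories)
    while True:
        event = await queue.get()
        if event_categories and event.get("category") not in event_categories:
            continue  # subscriber did not ask for this category
        yield event
```
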
### 3. Backward Compatibility
- Keep the existing polling endpoints during migration
- Gradually migrate features one at a time
- The frontend can use both during the transition

### 4. Error Handling
- Graceful degradation if streaming is unavailable
- Automatic reconnection logic in the frontend
- Fallback to polling if SSE fails; server-side heartbeats (sketched below) let clients detect dead streams

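For the polling fallback to trigger reliably, clients have to notice a stream that has gone quiet. A common approach (an assumption here, not something the plan above specifies) is a periodic SSE comment as a heartbeat; queue items are assumed to be pre-formatted SSE strings:

```python
import asyncio

HEARTBEAT_SECONDS = 15.0  # assumed interval

async def event_stream_with_heartbeat(queue: asyncio.Queue):
    """Yield queued SSE events, emitting a comment line while idle."""
    while True:
        try:
            yield await asyncio.wait_for(queue.get(), timeout=HEARTBEAT_SECONDS)
        except asyncio.TimeoutError:
            # Lines starting with ":" are SSE comments; clients ignore the
            # payload but see traffic, so a missing heartbeat means the
            # connection is dead and polling should take over.
            yield ": heartbeat\n\n"
```
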
## Example: Complete Flow

### Backend: Workflow Processing
```python
async def process_workflow(workflow_id: str):
    streaming = get_streaming_manager()

    # Emit status update
    await streaming.emit_event(workflow_id, "status",
                               {"status": "running"}, "workflow")

    # Process and emit messages
    result = await ai_call(...)
    await streaming.emit_event(workflow_id, "message",
                               {"role": "assistant", "content": result}, "chat")

    # Emit completion
    await streaming.emit_event(workflow_id, "complete",
                               {"status": "completed"}, "workflow")
```

### Frontend: React Hook
```typescript
function ChatComponent({ workflowId }: { workflowId: string }) {
  const [messages, setMessages] = useState<Message[]>([]);

  useStreaming(workflowId, ["chat"], (event) => {
    if (event.type === "message") {
      setMessages(prev => [...prev, event.item]);
    }
  });

  return <MessageList messages={messages} />;
}
```

## Conclusion

By transforming `eventManager.py` into a shared streaming utility:

1. **Eliminates Polling**: All features can stream updates in real time
2. **Improves Performance**: Reduces server load and latency
3. **Better UX**: Instant updates instead of polling delays
4. **Reusable**: Any feature can use streaming with minimal code
5. **Scalable**: A foundation for future real-time features

The migration can be done incrementally, feature by feature, without breaking existing functionality.

docs/WEBSEARCH_FIXES.md (new file, 134 lines)

@@ -0,0 +1,134 @@

# Web Search Content Extraction Fixes

## Problem Summary

The Tavily web search integration was failing to extract content from search results, causing web research to return empty or incomplete data. The main issues were related to handling `None` values and incomplete error recovery.

## Main Issues Fixed

### 1. Incomplete Content Extraction from Search Results

**Problem:**
- When the Tavily API returned search results, some results had `raw_content` set to `None` (not missing, but explicitly `None`)
- The code used `result.get("raw_content") or result.get("content", "")`, which still produced `None` when the `content` key was likewise present but set to `None`
- This caused `None` values to propagate through the system instead of falling back to the `content` field or an empty string

**Fix:**
Changed the content extraction in `aicorePluginTavily.py` to properly handle `None` values:
```python
# Before (line 344):
rawContent=result.get("raw_content") or result.get("content", "")

# After:
rawContent=result.get("raw_content") or result.get("content") or ""
```

This ensures that if `raw_content` is `None`, it falls back to `content`, and if that is also `None`, it defaults to an empty string.

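The underlying pitfall is that `dict.get(key, default)` only applies the default when the key is absent, not when its value is `None`. A minimal standalone demonstration (the result dict is made up):

```python
result = {"raw_content": None, "content": None}  # hypothetical Tavily-style result

broken = result.get("raw_content") or result.get("content", "")   # None: key exists, default unused
fixed = result.get("raw_content") or result.get("content") or ""  # "" thanks to the trailing `or ""`

assert broken is None
assert fixed == ""
```
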
**Additional Fix:**
Added defensive checks in the `webSearch` method to safely extract content even when result objects have unexpected structures:
```python
# Safely extract content with multiple fallbacks
content = ""
if hasattr(result, 'rawContent'):
    content = result.rawContent or ""
if not content and hasattr(result, 'content'):
    content = result.content or ""
```

### 2. NoneType Error When Logging Content Length

**Problem:**
- The code attempted to check `len(first_result.get('raw_content', ''))` for logging
- When the `raw_content` key existed but its value was `None`, `.get()` returned `None` instead of the default `''`
- This caused `len(None)` to fail with `TypeError: object of type 'NoneType' has no len()`

**Fix:**
Changed the logging code to safely handle `None` values:
```python
# Before (line 338):
logger.debug(f"First result has raw_content: {'raw_content' in first_result}, content length: {len(first_result.get('raw_content', ''))}")

# After:
raw_content = first_result.get('raw_content') or ''
logger.debug(f"First result has raw_content: {'raw_content' in first_result}, content length: {len(raw_content)}")
```

### 3. Missing Error Recovery in Content Extraction

**Problem:**
- When processing search results, if one result failed to extract, the entire extraction could fail
- No recovery mechanism to extract at least URLs even when content extraction failed
- Errors were logged, but processing stopped, losing potentially useful data

**Fix:**
Added per-result error handling with recovery:
```python
for result in searchResults:
    try:
        # Extract URL, content, title safely
        # ... extraction logic ...
    except Exception as resultError:
        logger.warning(f"Error processing individual search result: {resultError}")
        # Continue processing other results instead of failing completely
        continue
```

Also added recovery at the extraction level:
```python
except Exception as extractionError:
    logger.error(f"Error extracting URLs and content from search results: {extractionError}")
    # Try to recover at least URLs
    try:
        urls = [result.url for result in searchResults if hasattr(result, 'url') and result.url]
        logger.info(f"Recovered {len(urls)} URLs after extraction error")
    except Exception:
        logger.error("Failed to recover any URLs from search results")
```

### 4. Incomplete Crawl Result Processing

**Problem:**
- When the crawl returned results but processing of an individual page failed, the entire crawl was lost
- No fallback to extract at least URLs from failed crawl results
- Missing content fields could cause errors when formatting results

**Fix:**
Added error handling for individual page processing:
```python
for i, result in enumerate(crawlResults, 1):
    try:
        # Format page content
        # ... formatting logic ...
    except Exception as pageError:
        logger.warning(f"Error formatting page {i} from crawl: {pageError}")
        # Try to add at least the URL
        try:
            pageUrls.append(result.url if hasattr(result, 'url') and result.url else webCrawlPrompt.url)
        except Exception:
            pass
```

Also ensured all result fields have safe defaults:
```python
results.append(WebCrawlResult(
    url=result_url or url,   # Fallback to base URL
    content=result_content,  # Already ensured to be a string
    title=result_title       # Already ensured to be a string
))
```

## Impact

These fixes ensure that:
1. **Content is always extracted** - Even when `raw_content` is `None`, the system falls back to the `content` field or an empty string
2. **Partial results are preserved** - If some results fail, the others are still processed and returned
3. **URLs are recovered** - Even when content extraction fails completely, URLs can still be extracted for crawling
4. **No crashes from None values** - All `None` values are properly handled before operations like `len()` are called

## Testing Recommendations

- Test with Tavily search results that have `raw_content` set to `None`
- Test with mixed results (some with content, some without)
- Test error recovery when individual results fail
- Verify that URLs are still extracted even when content extraction fails

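As a starting point for the first two recommendations, a pytest-style sketch; `extract_raw_content` is a stand-in mirroring the fixed fallback chain, not the actual function name in `aicorePluginTavily.py`:

```python
def extract_raw_content(result: dict) -> str:
    # Mirrors the fixed expression: raw_content, then content, then "".
    return result.get("raw_content") or result.get("content") or ""

def test_raw_content_none_falls_back_to_content():
    assert extract_raw_content({"raw_content": None, "content": "text"}) == "text"

def test_all_fields_none_yield_empty_string():
    assert extract_raw_content({"raw_content": None, "content": None}) == ""

def test_mixed_results_never_produce_none():
    results = [{"raw_content": "raw"}, {"content": "c"}, {}]
    assert all(isinstance(extract_raw_content(r), str) for r in results)
```
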
@@ -321,13 +321,28 @@ class AiTavily(BaseConnectorAi):

             # Return all results without score filtering
             # Tavily's scoring is already applied by the API
-            logger.info(f"Tavily returned {len(response.get('results', []))} results")
+            results_count = len(response.get('results', []))
+            logger.info(f"Tavily returned {results_count} results")
+
+            # Log content availability
+            results_with_content = 0
+            for result in response.get('results', []):
+                if result.get("raw_content"):
+                    results_with_content += 1
+            logger.info(f"Tavily results with raw_content: {results_with_content}/{results_count}")
+
+            # Log first result structure for debugging
+            if response.get('results') and len(response['results']) > 0:
+                first_result = response['results'][0]
+                logger.debug(f"First result keys: {list(first_result.keys())}")
+                raw_content = first_result.get('raw_content') or ''
+                logger.debug(f"First result has raw_content: {'raw_content' in first_result}, content length: {len(raw_content)}")

             return [
                 WebSearchResult(
-                    title=result["title"],
-                    url=self._cleanUrl(result["url"]),
-                    rawContent=result.get("raw_content")
+                    title=result.get("title", ""),
+                    url=self._cleanUrl(result.get("url", "")),
+                    rawContent=result.get("raw_content") or result.get("content") or ""
                 )
                 for result in response["results"]
             ]
@@ -381,24 +396,90 @@ class AiTavily(BaseConnectorAi):
                 logger.debug(f"Tavily response received: {type(response)}")

                 # Parse response - could be dict with results or list
-                if isinstance(response, dict) and "results" in response:
-                    pageResults = response["results"]
+                if isinstance(response, dict):
+                    if "results" in response:
+                        pageResults = response["results"]
+                        logger.debug(f"Found 'results' key in response dict with {len(pageResults)} items")
+                    else:
+                        logger.warning(f"Response dict keys: {list(response.keys())}")
+                        # Check for other possible keys
+                        if "pages" in response:
+                            pageResults = response["pages"]
+                            logger.debug(f"Found 'pages' key with {len(pageResults)} items")
+                        elif "content" in response:
+                            # Single page result
+                            pageResults = [response]
+                            logger.debug("Found 'content' key, treating as single page result")
+                        else:
+                            logger.warning(f"Unexpected response dict structure: {list(response.keys())}")
+                            pageResults = []
                 elif isinstance(response, list):
                     pageResults = response
+                    logger.debug(f"Response is a list with {len(pageResults)} items")
                 else:
-                    logger.warning(f"Unexpected response format: {type(response)}")
+                    logger.warning(f"Unexpected response format: {type(response)}, value: {str(response)[:200]}")
                     pageResults = []

-                logger.debug(f"Got {len(pageResults)} pages from crawl")
+                logger.info(f"Got {len(pageResults)} pages from crawl for URL: {url}")
+                if len(pageResults) == 0:
+                    logger.warning(f"Tavily crawl returned 0 pages for URL: {url}. Response structure: {type(response)}")
+                    if isinstance(response, dict):
+                        logger.warning(f"Response keys: {list(response.keys())}")
+                        # Log all values to debug (not just first 3)
+                        for key, value in response.items():
+                            value_str = str(value)
+                            if len(value_str) > 200:
+                                value_str = value_str[:200] + "..."
+                            logger.warning(f"  {key}: {type(value)} - {value_str}")
+
+                        # Check for error messages in response
+                        if "error" in response:
+                            logger.error(f"Tavily API error in response: {response.get('error')}")
+                        if "message" in response:
+                            logger.warning(f"Tavily API message: {response.get('message')}")
+                    elif isinstance(response, str):
+                        logger.warning(f"Tavily returned string response (first 500 chars): {response[:500]}")
+                    else:
+                        logger.warning(f"Unexpected response type: {type(response)}, value: {str(response)[:500]}")

-                # Convert to WebCrawlResult format
+                # Convert to WebCrawlResult format with error handling
                 results = []
-                for result in pageResults:
-                    results.append(WebCrawlResult(
-                        url=result.get("url", url),
-                        content=result.get("raw_content", result.get("content", "")),
-                        title=result.get("title", "")
-                    ))
+                for idx, result in enumerate(pageResults):
+                    try:
+                        # Safely extract fields
+                        result_url = result.get("url") if isinstance(result, dict) else (getattr(result, "url", None) if hasattr(result, "url") else url)
+                        result_content = ""
+                        if isinstance(result, dict):
+                            result_content = result.get("raw_content") or result.get("content") or ""
+                        elif hasattr(result, "raw_content"):
+                            result_content = result.raw_content or ""
+                        elif hasattr(result, "content"):
+                            result_content = result.content or ""
+
+                        result_title = ""
+                        if isinstance(result, dict):
+                            result_title = result.get("title", "")
+                        elif hasattr(result, "title"):
+                            result_title = result.title or ""
+
+                        results.append(WebCrawlResult(
+                            url=result_url or url,
+                            content=result_content,
+                            title=result_title
+                        ))
+                    except Exception as resultError:
+                        logger.warning(f"Error processing crawl result {idx}: {resultError}")
+                        # Try to create a minimal result with at least the URL
+                        try:
+                            if isinstance(result, dict) and result.get("url"):
+                                results.append(WebCrawlResult(
+                                    url=result.get("url", url),
+                                    content="",
+                                    title=""
+                                ))
+                        except Exception:
+                            logger.error(f"Failed to create minimal result for crawl result {idx}")
+                        continue

                 logger.debug(f"Crawl successful: extracted {len(results)} pages from URL")
                 return results
@@ -413,7 +494,7 @@ class AiTavily(BaseConnectorAi):

             except Exception as e:
                 logger.warning(f"Crawl attempt {attempt + 1} failed for URL {url}: {str(e)}")
-                logger.debug(f"Full error details: {type(e).__name__}: {str(e)}")
+                logger.debug(f"Full error details: {type(e).__name__}: {str(e)}", exc_info=True)

                 # Check if it's a validation error and log more details
                 if "validation" in str(e).lower():
@@ -427,10 +508,22 @@ class AiTavily(BaseConnectorAi):
                     if len(url) > 2000:
                         logger.debug(f"  WARNING: URL is very long ({len(url)} chars)")

+                # Log API-specific errors
+                error_str = str(e).lower()
+                if "rate limit" in error_str or "429" in error_str:
+                    logger.error(f"Tavily API rate limit hit for URL: {url}")
+                elif "401" in error_str or "unauthorized" in error_str:
+                    logger.error(f"Tavily API authentication failed for URL: {url}")
+                elif "404" in error_str or "not found" in error_str:
+                    logger.warning(f"URL not found (404) for: {url}")
+                elif "timeout" in error_str:
+                    logger.warning(f"Timeout error for URL: {url}")
+
                 if attempt < maxRetries:
                     logger.info(f"Retrying in {retryDelay} seconds...")
                     await asyncio.sleep(retryDelay)
                 else:
                     logger.error(f"Crawl failed after {maxRetries + 1} attempts for URL: {url}")
                     raise Exception(f"Crawl failed after {maxRetries + 1} attempts: {str(e)}")

     async def _routeWebOperation(self, modelCall: AiModelCall) -> "AiModelResponse":
@@ -508,21 +601,84 @@ class AiTavily(BaseConnectorAi):
                 includeRawContent="text"
             )

-            # Extract URLs from results
-            urls = [result.url for result in searchResults]
+            # Extract URLs and content from results with error handling
+            urls = []
+            results_with_content = []
+            content_count = 0

-            # Return as JSON array
+            try:
+                for result in searchResults:
+                    try:
+                        # Safely extract URL
+                        url = result.url if hasattr(result, 'url') and result.url else ""
+                        if url:
+                            urls.append(url)
+
+                        # Safely extract content
+                        content = ""
+                        if hasattr(result, 'rawContent'):
+                            content = result.rawContent or ""
+                        if not content and hasattr(result, 'content'):
+                            content = result.content or ""
+
+                        if content:
+                            content_count += 1
+
+                        # Safely extract title
+                        title = result.title if hasattr(result, 'title') and result.title else ""
+
+                        results_with_content.append({
+                            "url": url,
+                            "title": title,
+                            "content": content,
+                            "score": getattr(result, 'score', 0)
+                        })
+                    except Exception as resultError:
+                        logger.warning(f"Error processing individual search result: {resultError}")
+                        # Continue processing other results
+                        continue
+
+                logger.info(f"Tavily search: {len(urls)} URLs, {content_count} with content, {len(results_with_content)} total results")
+                if content_count == 0:
+                    logger.warning("Tavily search returned no content - results may need crawling")
+            except Exception as extractionError:
+                logger.error(f"Error extracting URLs and content from search results: {extractionError}")
+                # Try to recover at least URLs
+                try:
+                    urls = [result.url for result in searchResults if hasattr(result, 'url') and result.url]
+                    logger.info(f"Recovered {len(urls)} URLs after extraction error")
+                except Exception:
+                    logger.error("Failed to recover any URLs from search results")
+
+            # Return both URLs and full results in JSON for direct extraction
+            # Format: {"urls": [...], "results": [...]}
+            import json
+            response_data = {
+                "urls": urls,
+                "results": results_with_content
+            }

             return AiModelResponse(
-                content=json.dumps(urls, indent=2),
+                content=json.dumps(response_data, indent=2),
                 success=True,
-                metadata={"total_urls": len(urls), "operation": "WEB_SEARCH_DATA"}
+                metadata={
+                    "total_urls": len(urls),
+                    "operation": "WEB_SEARCH_DATA",
+                    "results_with_content": results_with_content  # Also in metadata for compatibility
+                }
             )

         except Exception as e:
-            logger.error(f"Error in Tavily web search: {str(e)}")
+            logger.error(f"Error in Tavily web search: {str(e)}", exc_info=True)
+            import json
+            # Return error response with empty results
+            error_response = {
+                "urls": [],
+                "results": [],
+                "error": str(e)
+            }
             return AiModelResponse(
-                content="[]",
+                content=json.dumps(error_response, indent=2),
                 success=False,
                 error=str(e)
             )
@@ -575,23 +731,44 @@ class AiTavily(BaseConnectorAi):
             # If we got multiple pages from the crawl, we need to format them differently
             # Return the first result for backwards compatibility, but include total page count
             if crawlResults and len(crawlResults) > 0:
-                # Get all pages content
+                # Get all pages content with error handling
                 allContent = ""
+                pageUrls = []
                 for i, result in enumerate(crawlResults, 1):
-                    pageHeader = f"\n{'='*60}\nPAGE {i}: {result.url}\n{'='*60}\n"
-                    if result.title:
-                        allContent += f"{pageHeader}Title: {result.title}\n\n"
-                    allContent += f"{result.content}\n"
+                    try:
+                        pageHeader = f"\n{'='*60}\nPAGE {i}: {result.url}\n{'='*60}\n"
+                        if result.title:
+                            allContent += f"{pageHeader}Title: {result.title}\n\n"
+                        else:
+                            allContent += f"{pageHeader}\n"
+                        allContent += f"{result.content or ''}\n"
+                        pageUrls.append(result.url)
+                    except Exception as pageError:
+                        logger.warning(f"Error formatting page {i} from crawl: {pageError}")
+                        # Try to add at least the URL
+                        try:
+                            pageUrls.append(result.url if hasattr(result, 'url') and result.url else webCrawlPrompt.url)
+                        except Exception:
+                            pass

                 resultData = {
                     "url": webCrawlPrompt.url,
-                    "title": crawlResults[0].title if crawlResults[0].title else "Content",
+                    "title": crawlResults[0].title if crawlResults and crawlResults[0].title else "Content",
                     "content": allContent,
                     "pagesCrawled": len(crawlResults),
-                    "pageUrls": [result.url for result in crawlResults]
+                    "pageUrls": pageUrls
                 }
                 logger.info(f"Crawl successful: {len(crawlResults)} pages extracted from {webCrawlPrompt.url}")
             else:
-                resultData = {"url": webCrawlPrompt.url, "title": "", "content": "", "error": "No content extracted", "pagesCrawled": 0}
+                logger.warning(f"Crawl returned no results for URL: {webCrawlPrompt.url}")
+                resultData = {
+                    "url": webCrawlPrompt.url,
+                    "title": "",
+                    "content": "",
+                    "error": "No content extracted - Tavily crawl returned 0 pages",
+                    "pagesCrawled": 0,
+                    "pageUrls": []
+                }

             # Return as JSON - same format as Perplexity but with multiple pages content
             import json
@@ -602,9 +779,17 @@ class AiTavily(BaseConnectorAi):
             )

         except Exception as e:
-            logger.error(f"Error in Tavily web crawl: {str(e)}")
+            logger.error(f"Error in Tavily web crawl: {str(e)}", exc_info=True)
             import json
-            errorResult = {"error": str(e), "url": webCrawlPrompt.url if 'webCrawlPrompt' in locals() else ""}
+            crawl_url = webCrawlPrompt.url if 'webCrawlPrompt' in locals() else ""
+            errorResult = {
+                "url": crawl_url,
+                "title": "",
+                "content": "",
+                "error": str(e),
+                "pagesCrawled": 0,
+                "pageUrls": []
+            }
             return AiModelResponse(
                 content=json.dumps(errorResult, indent=2),
                 success=False,
modules/features/chatbot/chatbotConstants.py (new file, 731 lines)

@@ -0,0 +1,731 @@

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Constants and utility functions for the chatbot module.
Contains system prompts and conversation name generation.
"""

import logging
import re
import datetime
from typing import Optional, List

from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, ProcessingModeEnum

logger = logging.getLogger(__name__)


def get_analysis_system_prompt() -> str:
    """
    Get the system prompt for analyzing user input and creating queries.
    Focuses on understanding the question and determining what queries are needed.
    """
    current_date = datetime.datetime.now().strftime("%d.%m.%Y")

    return f"""Heute ist der {current_date}.

Du bist ein Chatbot der Althaus AG.
Deine Aufgabe ist es, Benutzeranfragen zu analysieren und zu bestimmen, welche Datenbankabfragen oder Web-Recherchen benötigt werden, um die Frage zu beantworten.

DATENBANK-INFORMATIONEN:
- Datenbankdatei: /data/database.db (SQLite)
- Tabellen: Artikel, Einkaufspreis, Lagerplatz_Artikel, Lagerplatz

Die Datenbank besteht aus vier Tabellen, die über Beziehungen verbunden sind:
- **Artikel**: Enthält alle Produktinformationen (I_ID, Artikelbezeichnung, Artikelnummer, etc.)
- **Einkaufspreis**: Enthält Preisdaten (m_Artikel, EP_CHF)
- **Lagerplatz_Artikel**: Enthält Lagerbestands- und Lagerplatzinformationen (R_ARTIKEL, R_LAGERPLATZ, Bestände, etc.)
- **Lagerplatz**: Enthält die tatsächlichen Lagerplatznamen und -informationen (I_ID, Lagerplatz, R_LAGER, R_LAGERORT)
- **Beziehungen**:
  - Artikel.I_ID = Einkaufspreis.m_Artikel
  - Artikel.I_ID = Lagerplatz_Artikel.R_ARTIKEL
  - Lagerplatz_Artikel.R_LAGERPLATZ = Lagerplatz.I_ID (WICHTIG: R_LAGERPLATZ enthält die ID, nicht den Namen!)

TABELLEN-SCHEMA (WICHTIG - Spalten mit Leerzeichen/Sonderzeichen IMMER in doppelte Anführungszeichen setzen):

Tabelle 1: Artikel
CREATE TABLE Artikel (
    "I_ID" INTEGER PRIMARY KEY,
    "Artikelbeschrieb" TEXT,
    "Artikelbezeichnung" TEXT,
    "Artikelgruppe" TEXT,
    "Artikelkategorie" TEXT,
    "Artikelkürzel" TEXT,
    "Artikelnummer" TEXT,
    "Einheit" TEXT,
    "Gesperrt" TEXT,
    "Keywords" TEXT,
    "Lieferant" TEXT,
    "Warengruppe" TEXT
)

Tabelle 2: Einkaufspreis
CREATE TABLE Einkaufspreis (
    "m_Artikel" INTEGER,
    "EP_CHF" FLOAT
)

Tabelle 3: Lagerplatz_Artikel
CREATE TABLE Lagerplatz_Artikel (
    "R_ARTIKEL" INTEGER,
    "R_LAGERPLATZ" TEXT,
    "S_BESTELLTER__BESTAND" INTEGER,
    "S_IST_BESTAND" TEXT,
    "S_MAXIMALBESTAND" INTEGER,
    "S_MINDESTBESTAND" INTEGER,
    "S_RESERVIERTER__BESTAND" INTEGER,
    "S_SOLL_BESTAND" INTEGER
)

Tabelle 4: Lagerplatz
CREATE TABLE Lagerplatz (
    "I_ID" INTEGER PRIMARY KEY,
    "Lagerplatz" TEXT,
    "R_LAGER" TEXT,
    "R_LAGERORT" TEXT
)

⚠️⚠️⚠️ KRITISCH - LAGERBESTANDSABFRAGEN - ABSOLUT VERBINDLICH ⚠️⚠️⚠️
JEDE SQL-Abfrage, die Lagerbestände (S_IST_BESTAND) zeigt oder verwendet, MUSS IMMER auch enthalten:
- l."S_RESERVIERTER__BESTAND" (Reservierte Bestände) - OBLIGATORISCH!
- Berechnung des verfügbaren Bestands - OBLIGATORISCH!
- JOIN mit Lagerplatz-Tabelle für den Lagerplatznamen - OBLIGATORISCH!

VERBOTEN: Abfragen ohne reservierte Bestände - auch nicht als "korrigierte Abfrage"!
VERBOTEN: Zwischenschritte ohne reservierte Bestände!
VERBOTEN: "Korrigierte Abfragen ohne reservierte Bestände" - das ist KEINE Korrektur, das ist FALSCH!

SQL-ANFORDERUNGEN - ABSOLUT VERBINDLICH:
JEDE Abfrage, die Lagerbestände zeigt, MUSS diese Struktur haben:
- JOIN mit Lagerplatz-Tabelle: LEFT JOIN Lagerplatz lp ON l."R_LAGERPLATZ" = lp."I_ID"
- Lagerplatzname anzeigen: lp."Lagerplatz" as "Lagerplatzname" (NICHT l."R_LAGERPLATZ"!)
- Ist-Bestand: l."S_IST_BESTAND"
- Reservierte Bestände: IMMER l."S_RESERVIERTER__BESTAND" hinzufügen (OBLIGATORISCH!)
- Verfügbarer Bestand berechnen: CASE WHEN l."S_IST_BESTAND" != 'Unbekannt' THEN CAST(l."S_IST_BESTAND" AS INTEGER) - COALESCE(l."S_RESERVIERTER__BESTAND", 0) ELSE NULL END as "Verfügbarer Bestand" (OBLIGATORISCH!)

SQL-HINWEISE:
- Verwende IMMER doppelte Anführungszeichen für Spaltennamen: "Artikelkürzel", "Artikelnummer", etc.
- Für Textsuche verwende LIKE mit Wildcards: WHERE a."Artikelbezeichnung" LIKE '%suchbegriff%'
- Für Preisabfragen: Nutze JOINs um auf e."EP_CHF" zuzugreifen
- Für Lagerbestände: Nutze JOINs um auf l."S_IST_BESTAND", l."S_SOLL_BESTAND", etc. zuzugreifen
- WICHTIG bei S_IST_BESTAND: Dieser Wert kann "Unbekannt" sein (TEXT), nicht nur Zahlen! Prüfe mit WHERE l."S_IST_BESTAND" != 'Unbekannt' wenn du nur numerische Werte willst
- Sortierung oft sinnvoll: ORDER BY a."Artikelnummer" ASC, ORDER BY e."EP_CHF" DESC, oder ORDER BY l."S_IST_BESTAND" DESC
- Verwende Tabellenaliase (a für Artikel, e für Einkaufspreis, l für Lagerplatz_Artikel, lp für Lagerplatz) für bessere Lesbarkeit
- WICHTIG: Du kannst bis zu 50 Ergebnisse pro Abfrage abrufen

ARTIKELKÜRZEL vs ARTIKELNUMMER - WICHTIG:
Es gibt zwei verschiedene Identifikatoren für Artikel:

1. **Artikelkürzel**: Numerisches Format (z.B. "131741", "141215")
   - Besteht aus reinen Zahlen
   - Format: Nur Ziffern, keine Buchstaben, keine Bindestriche, keine Leerzeichen
   - Beispiel: "131741", "141215"

2. **Artikelnummer**: Alphanumerisches Format (z.B. "6AV2 181-8XP00-0AX0", "AX5206")
   - Kann Buchstaben, Zahlen, Bindestriche und Leerzeichen enthalten
   - Format: Alphanumerisch, kann Bindestriche und Leerzeichen enthalten
   - Beispiel: "6AV2 181-8XP00-0AX0", "AX5206", "SIE.6ES7500"

WICHTIG - RICHTIGE SPALTE VERWENDEN:
- Wenn der Nutzer eine rein numerische Zahl angibt (z.B. "131741", "141215") → Suche in a."Artikelkürzel"
- Wenn der Nutzer eine alphanumerische Bezeichnung angibt mit Buchstaben, Bindestrichen oder Leerzeichen (z.B. "6AV2 181-8XP00-0AX0", "AX5206") → Suche in a."Artikelnummer"

Beispiele:
- "Wie viele von 141215 haben wir auf Lager?" → Artikelkürzel "141215" → WHERE a."Artikelkürzel" = '141215'
- "Wie viel von 6AV2 181-8XP00-0AX0 haben wir auf Lager?" → Artikelnummer "6AV2 181-8XP00-0AX0" → WHERE a."Artikelnummer" = '6AV2 181-8XP00-0AX0'
- "Zeig mir Informationen zu AX5206" → Artikelnummer "AX5206" → WHERE a."Artikelnummer" = 'AX5206'

Bei Fragen nach Lagerbestand: Kombiniere mit der Lagerplatz_Artikel Tabelle über JOIN und beachte die Anforderungen aus dem Abschnitt "LAGERBESTANDSABFRAGEN"

Du antwortest ausschliesslich auf Deutsch. Nutze kein sz(ß) sondern immer ss.
"""


def get_final_answer_system_prompt() -> str:
    """
    Get the system prompt for generating the final answer.
    Focuses on formatting, presenting results, and user engagement.
    """
    current_date = datetime.datetime.now().strftime("%d.%m.%Y")

    return f"""Heute ist der {current_date}.

Du bist ein Chatbot der Althaus AG.
Deine Aufgabe ist es, auf Basis von Datenbank-Ergebnissen und Web-Recherchen hilfreiche, präzise Antworten zu geben.

QUELLENANGABE - DATENBANK:
WICHTIG: Wenn du Informationen aus der Datenbank präsentierst, kennzeichne dies IMMER klar für den Nutzer.
- Beginne deine Antwort mit einer klaren Kennzeichnung, z.B.: "Aus der Datenbank habe ich folgende Artikel gefunden:"
- Bei kombinierten Informationen (Datenbank + Internet): Trenne klar zwischen beiden Quellen

⚠️⚠️⚠️ QUELLENANGABE - INTERNET - ABSOLUT VERBINDLICH ⚠️⚠️⚠️
Wenn du Informationen aus einer Web-Recherche präsentierst, MUSST du dies IMMER explizit kennzeichnen und die Quellen angeben:
- ❌ VERBOTEN: Informationen aus Web-Recherchen ohne explizite Kennzeichnung zu präsentieren
- ❌ VERBOTEN: Informationen aus Web-Recherchen ohne Quellenangabe zu präsentieren
- ❌ VERBOTEN: Quellen nur am Ende als Liste zu präsentieren
- ✓ OBLIGATORISCH: Beginne IMMER mit einer expliziten Kennzeichnung, z.B.:
  * "Aus meiner Web-Recherche habe ich folgende Informationen gefunden:"
  * "Laut meiner Internet-Recherche:"
  * "Aus meiner Online-Suche:"
- ✓ OBLIGATORISCH: Gib IMMER die konkreten Quellen DIREKT NACH der jeweiligen Information an (nicht am Ende!)
- ✓ OBLIGATORISCH: Format: [Information] ([Quelle: Website-Name](URL))
- ✓ OBLIGATORISCH: Bei mehreren Informationen: Gib nach JEDER Information die entsprechende Quelle an
- ✓ OBLIGATORISCH: Trenne klar zwischen Datenbank-Informationen und Web-Recherchen
- ✓ OBLIGATORISCH: Wenn sowohl Datenbank- als auch Web-Informationen vorhanden sind, trenne diese klar in separaten Abschnitten

⚠️⚠️⚠️ DATENBLATT-LINKS - ABSOLUT VERBINDLICH ⚠️⚠️⚠️
Wenn Web-Recherche-Ergebnisse vorhanden sind, MUSST du IMMER:
- ✓ OBLIGATORISCH: Explizit erwähnen, dass Datenblätter verfügbar sind
- ✓ OBLIGATORISCH: ALLE verfügbaren Datenblatt-Links angeben (vollständige URLs)
- ✓ OBLIGATORISCH: Format: "Datenblätter verfügbar: [Link 1](URL1), [Link 2](URL2)"
- ✓ OBLIGATORISCH: Wenn keine direkten Datenblatt-Links vorhanden sind, gib Links zu Seiten mit technischen Informationen an
- ❌ VERBOTEN: Datenblatt-Links zu verschweigen oder nicht explizit zu erwähnen

⚠️⚠️⚠️ AUSFÜHRLICHE INFORMATIONEN - ABSOLUT VERBINDLICH ⚠️⚠️⚠️
Wenn Web-Recherche-Ergebnisse vorhanden sind, MUSST du:
- ✓ OBLIGATORISCH: AUSFÜHRLICHE Informationen präsentieren (nicht nur kurze Zusammenfassungen!)
- ✓ OBLIGATORISCH: Alle relevanten technischen Details angeben:
  * Technische Spezifikationen (Größe, Gewicht, Abmessungen, etc.)
  * Betriebsbedingungen (Temperatur, Spannung, etc.)
  * Kompatibilität und Anwendungsbereiche
  * Zertifizierungen und Normen
  * Installation und Verwendung
  * Weitere relevante Produktdetails
- ✓ OBLIGATORISCH: Strukturiere die Informationen übersichtlich (z.B. mit Abschnitten oder Aufzählungen)
- ❌ VERBOTEN: Nur oberflächliche Informationen zu geben
- ❌ VERBOTEN: Wichtige Details auszulassen

BEISPIEL FÜR KORREKTE QUELLENANGABE MIT INLINE-QUELLEN:
"Aus meiner Web-Recherche habe ich folgende Informationen gefunden:

**Technische Spezifikationen:**
- Speicherkapazität: 2 GB ([Quelle: Siemens Support](https://...))
- Format: Secure Digital (SD) Card ([Quelle: Best4Automation](https://...))
- Betriebsspannung: 3,3 V DC ([Quelle: Automation24](https://...))

**Kompatibilität:**
- Geeignet für SIMATIC HMI Comfort Panels ([Quelle: Siemens Support](https://...))
- Montage im Hoch- und Querformat möglich ([Quelle: Best4Automation](https://...))

**Zertifizierungen:**
- CE-zertifiziert ([Quelle: Automation24](https://...))
- Für ATEX-Zonen geeignet ([Quelle: Elit](https://...))

**Datenblätter verfügbar:**
- [Siemens Produktdatenblatt](https://...)
- [Technische Dokumentation](https://...)"

NIEMALS Informationen aus Web-Recherchen präsentieren, ohne explizit zu erwähnen, dass es sich um eine Web-Recherche handelt, und ohne die Quellen DIREKT NACH der jeweiligen Information anzugeben!

TABELLENLÄNGE UND ARTIKELANZAHL - KRITISCH:
WICHTIG: Zeige MAXIMAL 20 Artikel in Tabellen. Du darfst und sollst aber ausführliche Erklärungen liefern!

STRATEGIE FÜR VIELE TREFFER (> 20):
✓ Zeige Zusammenfassung mit Statistiken (Anzahl, Lieferanten, Preisspanne, Kategorien, Lagerbestände)
✓ Dann: Tabelle mit den 20 relevantesten/ersten Artikeln
✓ Unter der Tabelle: Hinweis, dass weitere Artikel existieren
✓ Biete Filteroptionen an (nach Lieferant, Preis, Lagerbestand, etc.)

WICHTIG:
- Tabellen: MAXIMAL 20 Zeilen
- Erklärungen: Dürfen AUSFÜHRLICH sein!
- Du darfst viele Daten abfragen und analysieren
- Präsentiere Tabellen aber KOMPAKT (max. 20 Zeilen)
- Ergänze mit detaillierten Erklärungen, Statistiken, Zusammenfassungen

ZAHLEN-PRÜFUNG - ABSOLUT KRITISCH:
BEVOR du deine finale Antwort zurückgibst, MUSST du diese Schritte befolgen:

1. ZÄHLE die TATSÄCHLICHEN Zeilen in deiner finalen Tabelle
2. Diese Zahl ist die EINZIGE korrekte Anzahl für deine Antwort
3. Verwende diese Zahl KONSISTENT überall in deiner Antwort:
   - In der Tabellenüberschrift
   - In Texten unter der Tabelle
   - In der Zusammenfassung
   - Überall wo du die Anzahl erwähnst

VERBOTEN - Inkonsistente Zahlen:
❌ FALSCH: "Verfügbare Lampen (50 Artikel)" + "Zeige die ersten 30 Artikel"
✓ RICHTIG: "Verfügbare Lampen (30 Artikel)" + "Zeige 30 Artikel"

Falls du dem User strukturierte Daten zurückgibst, formatiere sie bitte als Tabelle.
WICHTIG! Falls deine Tabelle nur einen Teil der Daten anzeigt, die du gefunden hast, dann vermerke dies bitte in deiner Antwort unter der Tabelle in Markdown _italic_.

Wann immer du eine Artikelnummer innerhalb einer Tabelle zurückgibst, markiere diese bitte als Markdown-Link:
[ARTIKELNUMMER](/details/ARTIKELNUMMER). ARTIKELNUMMER ist hierbei der Platzhalter, den du ersetzen musst.
WICHTIG! Du musst im Link die ARTIKELNUMMER sicher URL-encodieren. Encodiere aber NICHT die Artikelnummer in eckigen Klammern. Also encodiere den Ankertext nicht!
Ausserhalb einer Tabelle musst du keine Links auf Artikelnummern setzen.

Die erste Nachricht des Nutzers ist eine Antwort auf die folgende Nachricht:
"Hallo! Ich bin Ihr KI-Assistent für die Materialverwaltung. Wie kann ich Ihnen heute helfen?"

⚠️⚠️⚠️ ABSOLUT KRITISCH - KEINE DATEN ERFINDEN ⚠️⚠️⚠️

NIEMALS Daten erfinden oder halluzinieren:
- ❌ VERBOTEN: Preise erfinden (z.B. "Der Preis beträgt 1200 CHF", wenn kein Preis in den Daten ist)
- ❌ VERBOTEN: Lagerplätze erfinden (z.B. "Lager A-01", wenn dieser nicht in den Daten steht)
- ❌ VERBOTEN: Lagerbestände erfinden (z.B. "50 Stück", wenn dieser Wert nicht in den Daten ist)
- ❌ VERBOTEN: Artikelbezeichnungen erfinden oder ändern
- ❌ VERBOTEN: Lieferanten erfinden oder ändern
- ❌ VERBOTEN: Jegliche Werte erfinden, die nicht explizit in den Datenbank-Ergebnissen stehen

✓ RICHTIG: Wenn Daten fehlen, schreibe "Nicht verfügbar" oder "N/A"
✓ RICHTIG: Verwende NUR die tatsächlichen Werte aus den Datenbank-Ergebnissen
✓ RICHTIG: Wenn ein Wert NULL oder leer ist, schreibe "Nicht verfügbar"

FORMATIERUNGSREGELN FÜR ARTIKEL-ANFRAGEN:
1. Beginne mit: "Aus der Datenbank habe ich den Artikel [ARTIKELNUMMER] gefunden. Es handelt sich um [ARTIKELBEZEICHNUNG] von [LIEFERANT]."
   - Verwende die tatsächlichen Werte aus den Datenbank-Ergebnissen (Artikelbezeichnung und Lieferant)
   - Beispiel: "Aus der Datenbank habe ich den Artikel 6AV2 181-8XP00-0AX0 gefunden. Es handelt sich um eine Simatic HMI Speicherkarte 2GB SD Card von Siemens Schweiz AG."
   - Falls Artikelbezeichnung oder Lieferant fehlen, verwende "Nicht verfügbar"
2. Zeige Artikelinformationen als Liste (Artikelkürzel, Artikelnummer, Bezeichnung, Lieferant, Einkaufspreis)
3. Zeige Lagerbestände als Tabelle mit ALLEN Lagerplätzen
4. Berechne den Gesamtbestand aus den tatsächlichen Daten
5. Biete nächste Schritte an

WICHTIG: Wenn du dir nicht sicher bist, ob ein Wert korrekt ist, schreibe "Nicht verfügbar", statt zu erfinden!

⚠️⚠️⚠️ ABSOLUT KRITISCH - KEINE PLANUNGSSCHRITTE IN DER ANTWORT ⚠️⚠️⚠️

NIEMALS Planungsschritte, SQL-Queries oder Zwischenschritte in deine finale Antwort einbauen:
- ❌ VERBOTEN: "Ich werde jetzt die Datenbank durchsuchen..."
- ❌ VERBOTEN: "Suche in der Datenbank nach..."
- ❌ VERBOTEN: "Führe SQL-Abfrage aus..."
- ❌ VERBOTEN: SQL-Queries (SELECT-Statements) zeigen
- ❌ VERBOTEN: "Analysiere die Ergebnisse..."
- ❌ VERBOTEN: "Bereite die Abfrageergebnisse auf..."
- ❌ VERBOTEN: Jegliche Erklärungen über den Prozess oder die Methode

✓ RICHTIG: Beginne DIREKT mit "Aus der Datenbank habe ich den Artikel [ARTIKELNUMMER] gefunden:"
✓ RICHTIG: Zeige NUR die finale Antwort mit den Daten
✓ RICHTIG: Keine Planungsschritte, keine Queries, keine Zwischenschritte

Deine Antwort soll NUR die finale Antwort enthalten - keine Planung, keine Queries, keine Zwischenschritte!

⚠️⚠️⚠️ ABSOLUT KRITISCH - KEINE BEISPIELDATEN ERFINDEN ⚠️⚠️⚠️

NIEMALS Beispielartikel oder Testdaten erfinden:
- ❌ VERBOTEN: Beispielartikel wie "123456", "789012", "Beispielartikel 1", etc.
- ❌ VERBOTEN: Erfundene Lieferanten wie "Lieferant A", "Lieferant B"
- ❌ VERBOTEN: Erfundene Preise oder Bestände
- ❌ VERBOTEN: Jegliche Testdaten oder Beispieldaten

Wenn KEINE echten Daten aus der Datenbank vorhanden sind:
- ✓ Schreibe: "Es wurden keine Artikel in der Datenbank gefunden."
- ✓ Oder: "Die Datenbankabfrage hat keine Ergebnisse zurückgegeben."
- ✓ Oder: "Keine Daten verfügbar für diese Anfrage."

ERFINDE NIEMALS Daten, auch nicht als "Beispiel" oder "Test"!

NUTZER-ENGAGEMENT - NÄCHSTE SCHRITTE VORSCHLAGEN:
Am Ende jeder Antwort sollst du dem Nutzer immer hilfreiche Optionen für nächste Schritte anbieten. Zeige dem Nutzer, was alles möglich ist, und halte die Konversation aktiv.

Beispiele für Vorschläge:
- "Möchten Sie mehr Details zu einem bestimmten Artikel erfahren?"
- "Soll ich nach ähnlichen Produkten oder alternativen Lieferanten suchen?"
- "Interessieren Sie Lagerstände oder Preisinformationen zu diesen Artikeln?"
- "Soll ich die aktuellen Lagerbestände und Lagerplätze zu diesen Artikeln anzeigen?"
- "Möchten Sie Artikel mit niedrigem Lagerbestand oder unter Mindestbestand sehen?"
- "Kann ich Ihnen bei einer spezifischeren Suche helfen?"
- "Benötigen Sie technische Datenblätter oder weitere Produktinformationen aus dem Internet?"

Passe deine Vorschläge an den Kontext der Anfrage an und sei kreativ. Ziel ist es, dem Nutzer zu zeigen, welche Möglichkeiten er hat, und ihn zur weiteren Interaktion zu ermutigen.

Du antwortest ausschliesslich auf Deutsch. Nutze kein sz(ß) sondern immer ss.
"""


def get_system_prompt() -> str:
    """
    DEPRECATED: Use get_analysis_system_prompt() or get_final_answer_system_prompt() instead.
    Kept for backward compatibility.
    """
    return get_final_answer_system_prompt()


def get_initial_analysis_prompt(user_prompt: str, context: str) -> str:
    """
    Get the prompt for initial user input analysis.

    Args:
        user_prompt: User's input prompt
        context: Conversation context

    Returns:
        Formatted prompt string
    """
    system_prompt = get_analysis_system_prompt()
    return f"""{system_prompt}

User question: {user_prompt}{context}

Analysiere die Benutzeranfrage und bestimme:
1. Ob eine Datenbankabfrage benötigt wird (needsDatabaseQuery)
2. Ob eine Web-Recherche benötigt wird (needsWebResearch)
3. Falls eine Datenbankabfrage benötigt wird: Erstelle MEHRERE separate, vollständige, ausführbare SQL-Abfragen
   - Eine Abfrage pro benötigter Tabelle/Datenquelle
   - Beispiel: Für Lagerbestandsabfragen: eine Abfrage für Artikel-Informationen, eine für Lagerplatz-Informationen
   - Jede Abfrage sollte fokussiert sein und die benötigten Informationen aus einer spezifischen Tabelle/Datenquelle abrufen
4. Begründung für deine Entscheidung

⚠️⚠️⚠️ WICHTIG - WEB-RECHERCHE BEI ZUSÄTZLICHEN INFORMATIONEN ⚠️⚠️⚠️
Wenn der Nutzer nach zusätzlichen Informationen fragt oder explizit eine Recherche anfordert, MUSS IMMER eine Web-Recherche durchgeführt werden (needsWebResearch = true).
Beispiele für solche Anfragen:
- "recherchier nach weiteren informationen zu diesem produkt"
- "suche nach zusätzlichen informationen"
- "finde mehr details"
- "recherchiere im internet"
- "suche online nach"
- Ähnliche Formulierungen, die eine Recherche oder zusätzliche Informationen anfordern
In diesen Fällen IMMER needsWebResearch auf true setzen!

WICHTIG für SQL-Abfragen:
- Verwende IMMER doppelte Anführungszeichen für Spaltennamen
- Bei Lagerbestandsabfragen: IMMER S_RESERVIERTER__BESTAND und verfügbaren Bestand einbeziehen
- Bei Lagerplatzabfragen: IMMER JOIN mit Lagerplatz-Tabelle für den Namen
- Abfragen müssen direkt ausführbar sein (keine Platzhalter)
- Erstelle SEPARATE Abfragen für verschiedene Tabellen/Datenquellen, nicht eine große JOIN-Abfrage

STRATEGIE FÜR MEHRERE ABFRAGEN:
- Analysiere welche Informationen benötigt werden
- Identifiziere welche Tabellen diese Informationen enthalten
- Erstelle für jede Tabelle/Datenquelle eine separate, fokussierte Abfrage
- Beispiel für "wie viel von 6AV2 181-8XP00-0AX0 haben wir auf lager":
  * Abfrage 1: Artikel-Informationen (Artikelbezeichnung, Lieferant, etc.) aus Artikel-Tabelle
  * Abfrage 2: Lagerbestände und Lagerplätze aus Lagerplatz_Artikel + Lagerplatz-Tabellen

Return ONLY valid JSON:
{{
    "needsDatabaseQuery": boolean,
    "needsWebResearch": boolean,
    "sqlQueries": [
        {{
            "query": string (ready-to-execute SQL with double quotes for column names),
            "purpose": string (description of what this query retrieves, e.g., "Get product information from Artikel table"),
            "table": string (primary table name, e.g., "Artikel", "Lagerplatz_Artikel")
        }}
    ] (array of query objects, empty array if needsDatabaseQuery is false),
    "reasoning": string
}}"""


def get_query_needs_analysis_prompt(
|
||||
user_prompt: str,
|
||||
context: str,
|
||||
query_history: List[str],
|
||||
results_summary: str,
|
||||
validation_summary: str,
|
||||
empty_results_instructions: str
|
||||
) -> str:
|
||||
"""
|
||||
Get the prompt for analyzing if more database queries are needed.
|
||||
|
||||
Args:
|
||||
user_prompt: Original user prompt
|
||||
context: Conversation context
|
||||
query_history: List of SQL queries already executed
|
||||
results_summary: Summary of current query results
|
||||
validation_summary: Summary of validation issues
|
||||
empty_results_instructions: Instructions for handling empty results
|
||||
|
||||
Returns:
|
||||
Formatted prompt string
|
||||
"""
|
||||
system_prompt = get_analysis_system_prompt()
|
||||
history_summary = "\n".join([f"- {q[:100]}..." for q in query_history]) if query_history else "No queries executed yet."
|
||||
|
||||
return f"""{system_prompt}
|
||||
|
||||
User question: {user_prompt}{context}
|
||||
|
||||
Bisher ausgeführte Abfragen:
|
||||
{history_summary}
|
||||
|
||||
Aktuelle Abfrageergebnisse:
|
||||
{results_summary}{validation_summary}{empty_results_instructions}
|
||||
|
||||
Analysiere, ob weitere Datenbankabfragen nötig sind:
|
||||
- Sind alle relevanten Tabellen abgefragt worden? (Artikel, Einkaufspreis, Lagerplatz_Artikel, Lagerplatz)
|
||||
- Sind die Ergebnisse ausreichend, um die Frage zu beantworten?
|
||||
- Fehlen JOINs oder Beziehungen zwischen Tabellen?
|
||||
- Gibt es Fehler, die korrigiert werden müssen?
|
||||
- Werden alle benötigten Informationen abgerufen (z.B. Lagerplatzname statt nur ID, reservierte Bestände, verfügbarer Bestand)?
|
||||
- Gibt es Validierungsprobleme, die durch zusätzliche Queries behoben werden können?
|
||||
- **WICHTIG**: Wenn Queries 0 Zeilen zurückgegeben haben, MUSS eine alternative Strategie versucht werden!
|
||||
|
||||
WICHTIG: Wenn Validierungsprobleme vorhanden sind, MUSS eine korrigierte Query erstellt werden, die diese Probleme behebt!
|
||||
WICHTIG: Wenn leere Ergebnisse erkannt wurden, MUSS eine alternative Query-Strategie verwendet werden!
|
||||
|
||||
Return ONLY valid JSON:
|
||||
{{
|
||||
"needsMoreQueries": boolean,
|
||||
"sqlQuery": string (ready-to-execute SQL if needsMoreQueries is true, empty string otherwise),
|
||||
"reasoning": string (explanation of decision)
|
||||
}}"""

def get_empty_results_retry_instructions(empty_count: int) -> str:
    """
    Get retry instructions when empty results are detected.

    Args:
        empty_count: Number of queries that returned empty results

    Returns:
        Formatted instructions string
    """
    if empty_count == 0:
        return ""

    return f"""
⚠️⚠️⚠️ KRITISCH - LEERE ERGEBNISSE ERKANNT ⚠️⚠️⚠️

Es wurden {empty_count} Query(s) ausgeführt, die 0 Zeilen zurückgegeben haben. Dies bedeutet, dass die bisherige Query-Strategie nicht erfolgreich war.

DU MUSST JETZT EINE ALTERNATIVE QUERY-STRATEGIE VERSUCHEN!

Verfügbare Tabellen im System:
1. Artikel - Enthält alle Produktinformationen (I_ID, Artikelbezeichnung, Artikelnummer, etc.)
2. Einkaufspreis - Enthält Preisdaten (m_Artikel, EP_CHF)
3. Lagerplatz_Artikel - Enthält Lagerbestands- und Lagerplatzinformationen (R_ARTIKEL, R_LAGERPLATZ, Bestände, etc.)
4. Lagerplatz - Enthält die tatsächlichen Lagerplatznamen und -informationen (I_ID, Lagerplatz, R_LAGER, R_LAGERORT)

ALTERNATIVE STRATEGIEN ZUM AUSPROBIEREN:

1. **Direkte Lagerplatz-Suche**: Prüfe zuerst, ob der Lagerplatzname in der Lagerplatz-Tabelle existiert:
   SELECT * FROM Lagerplatz WHERE "Lagerplatz" LIKE '%[Suchbegriff]%'

2. **Verschiedene Schreibweisen**: Versuche verschiedene Schreibweisen (Groß-/Kleinschreibung, Teilstrings):
   - UPPER/LOWER Funktionen verwenden
   - Verschiedene LIKE-Patterns: '%term%', 'term%', '%term'

3. **JOIN-Strategie überprüfen**: Stelle sicher, dass R_LAGERPLATZ korrekt mit Lagerplatz.I_ID gejoint wird:
   - R_LAGERPLATZ in Lagerplatz_Artikel enthält die ID (nicht den Namen!)
   - Verwende: LEFT JOIN Lagerplatz lp ON l."R_LAGERPLATZ" = lp."I_ID"

4. **Breitere Suche**: Versuche eine breitere Suche ohne exakte Filter:
   - Entferne zu spezifische WHERE-Bedingungen
   - Verwende OR-Bedingungen für verschiedene Suchvarianten

5. **Andere Tabellen zuerst**: Versuche zuerst eine einfache Abfrage auf einer einzelnen Tabelle, dann JOINs:
   - Starte mit Lagerplatz-Tabelle direkt
   - Dann JOIN mit Lagerplatz_Artikel
   - Dann JOIN mit Artikel

WICHTIG: Wenn alle bisherigen Queries 0 Zeilen zurückgegeben haben, MUSS eine alternative Query-Strategie versucht werden!
Erstelle eine neue Query, die eine der oben genannten Strategien verwendet. Versuche verschiedene Ansätze, bis Ergebnisse gefunden werden.
"""

def get_formatting_instructions() -> str:
    """
    Get formatting instructions for the final answer.

    Returns:
        Formatted instructions string
    """
    return """
WICHTIGSTE REGELN - ABSOLUT VERBINDLICH:

0. VERBOTEN IN DER ANTWORT - ABSOLUT NICHT ZEIGEN:
   ❌ KEINE Planungsschritte ("Ich werde jetzt...", "Suche in der Datenbank...", etc.)
   ❌ KEINE SQL-Queries (SELECT-Statements)
   ❌ KEINE Zwischenschritte ("Führe SQL-Abfrage aus...", "Analysiere Ergebnisse...", etc.)
   ❌ KEINE Erklärungen über den Prozess oder die Methode
   ❌ KEINE "Ich werde..."- oder "Ich suche..."-Sätze
   ✓ NUR die finale Antwort mit den Daten zeigen!

1. VERWENDE NUR DIE TATSÄCHLICHEN DATEN AUS DEN DATENBANK-ERGEBNISSEN
   - Erfinde KEINE Preise, Lagerplätze, Bestände oder andere Daten
   - Wenn ein Wert fehlt, schreibe "Nicht verfügbar" oder "N/A"
   - Verwende KEINE Platzhalter oder geschätzte Werte

2. FORMATIERUNG FÜR ARTIKEL-ANFRAGEN:
   Beginne DIREKT mit: "Aus der Datenbank habe ich den Artikel [ARTIKELNUMMER] gefunden. Es handelt sich um [ARTIKELBEZEICHNUNG] von [LIEFERANT]."
   - Verwende die tatsächlichen Werte aus den Datenbank-Ergebnissen (Artikelbezeichnung und Lieferant)
   - Beispiel: "Aus der Datenbank habe ich den Artikel 6AV2 181-8XP00-0AX0 gefunden. Es handelt sich um eine Simatic HMI Speicherkarte 2GB SD Card von Siemens Schweiz AG."
   - Falls Artikelbezeichnung oder Lieferant fehlen, verwende "Nicht verfügbar"

   Dann zeige:

   Artikelinformationen
   - Artikelkürzel: [Wert aus Datenbank oder "Nicht verfügbar"]
   - Artikelnummer: [Wert aus Datenbank oder "Nicht verfügbar"]
   - Bezeichnung: [Wert aus Datenbank oder "Nicht verfügbar"]
   - Lieferant: [Wert aus Datenbank oder "Nicht verfügbar"]
   - Einkaufspreis: [Wert aus Datenbank oder "Nicht verfügbar"]

   Lagerbestände nach Lagerplätzen
   [Tabelle mit ALLEN Lagerplätzen aus den Daten]
   Lagerplatz | Ist-Bestand | Soll-Bestand | Min-Bestand | Max-Bestand | Reservierter Bestand | Verfügbarer Bestand

   Gesamtbestand: [Summe aller Ist-Bestände] Stück (alle am Lagerplatz "[Lagerplatzname]")

   Möchten Sie:
   - Mehr technische Details zu diesem Artikel erfahren?
   - Nach ähnlichen Artikeln suchen?
   - Informationen zu anderen Artikeln im Lager anzeigen?
   - Den aktuellen Preis oder Lieferzeiten prüfen?

3. STELLE SICHER, DASS ALLE LAGERPLÄTZE ANGEZEIGT WERDEN
   - Wenn mehrere Lagerplätze vorhanden sind, zeige ALLE in der Tabelle
   - Gruppiere nicht - zeige jeden Lagerplatz als separate Zeile

4. VERWENDE NUR DIE TATSÄCHLICHEN WERTE
   - Wenn Einkaufspreis fehlt: "Nicht verfügbar" (NICHT erfinden!)
   - Wenn Lagerplatz fehlt: "Nicht verfügbar" (NICHT erfinden!)
   - Wenn Bestand fehlt: "Nicht verfügbar" (NICHT erfinden!)
"""

def get_final_answer_prompt(
    user_prompt: str,
    context: str,
    formatting_instructions: str,
    structured_data_part: str,
    db_results_part: str,
    web_results_part: str
) -> str:
    """
    Get the prompt for generating the final answer.

    Args:
        user_prompt: User's original prompt
        context: Conversation context
        formatting_instructions: Formatting instructions
        structured_data_part: Structured data section
        db_results_part: Database results section
        web_results_part: Web research results section

    Returns:
        Formatted prompt string
    """
    system_prompt = get_final_answer_system_prompt()

    return f"""{system_prompt}

Antworte auf die folgende Frage des Nutzers: {user_prompt}{context}

{formatting_instructions}

{structured_data_part}

{db_results_part}{web_results_part}

KRITISCH: Verwende NUR die oben angegebenen Daten. Erfinde KEINE Werte. Wenn Daten fehlen, schreibe "Nicht verfügbar".

⚠️⚠️⚠️ ABSOLUT KRITISCH - WEB-RECHERCHE QUELLENANGABE ⚠️⚠️⚠️
Wenn WEB-RECHERCHE-ERGEBNISSE oben vorhanden sind, MUSST du:
- ✓ IMMER explizit erwähnen, dass die Informationen aus einer Web-Recherche stammen
- ✓ IMMER alle Quellen DIREKT NACH der jeweiligen Information angeben (INLINE, nicht am Ende!)
- ✓ Format: [Information] ([Quelle: Website-Name](URL))
- ✓ IMMER AUSFÜHRLICHE Informationen präsentieren (nicht nur kurze Zusammenfassungen!)
- ✓ IMMER alle verfügbaren Datenblatt-Links explizit erwähnen und angeben
- ✓ Format für Datenblätter: "Datenblätter verfügbar: [Link 1](URL1), [Link 2](URL2)"
- ✓ Die Web-Recherche-Informationen klar von Datenbank-Informationen trennen
- ❌ VERBOTEN: Web-Recherche-Informationen ohne explizite Kennzeichnung zu präsentieren
- ❌ VERBOTEN: Web-Recherche-Informationen ohne Quellenangabe zu präsentieren
- ❌ VERBOTEN: Quellen nur am Ende als Liste zu präsentieren
- ❌ VERBOTEN: Datenblatt-Links zu verschweigen oder nicht explizit zu erwähnen
- ❌ VERBOTEN: Nur oberflächliche Informationen zu geben

⚠️⚠️⚠️ ABSOLUT VERBOTEN - KEINE DATEN ERFINDEN ⚠️⚠️⚠️
Wenn KEINE Datenbank-Ergebnisse vorhanden sind (keine DATENBANK-ERGEBNISSE oder STRUKTURIERTE DATEN oben), dann:
- ❌ ERFINDE KEINE Artikelnummern, Artikelbezeichnungen, Preise oder Lagerbestände!
- ❌ ERFINDE KEINE Beispielartikel wie "123456", "789012", "Beispielartikel 1", "Lieferant A", etc.!
- ❌ ERFINDE KEINE Daten, auch nicht als "Beispiel"!
- ❌ Wenn DATENBANK-FEHLER vorhanden sind, bedeutet das: KEINE DATEN VERFÜGBAR - ERFINDE NICHTS!
- ✓ Schreibe stattdessen: "Es wurden keine Artikel in der Datenbank gefunden." oder "Die Datenbankabfrage ist fehlgeschlagen."
- ✓ Wenn Fehler vorhanden sind: "Die Datenbankabfrage konnte nicht ausgeführt werden. Bitte versuchen Sie es später erneut oder kontaktieren Sie den Administrator."

WICHTIG: Deine Antwort soll NUR die finale Antwort enthalten - KEINE Planungsschritte, KEINE SQL-Queries, KEINE Zwischenschritte!
Beginne DIREKT mit "Aus der Datenbank habe ich..." (wenn Daten vorhanden) oder "Es wurden keine Artikel gefunden" (wenn keine Daten vorhanden).
Entferne ALLE Planungsschritte, SQL-Queries und Zwischenschritte aus deiner Antwort - zeige NUR die finale Antwort mit den Daten!"""

async def generate_conversation_name(
    services,
    userPrompt: str,
    userLanguage: str = "en"
) -> str:
    """
    Generate a short, descriptive conversation name based on the user's prompt.

    Args:
        services: Services instance with AI access
        userPrompt: The user's input prompt
        userLanguage: User's preferred language (for prompt localization)

    Returns:
        Short conversation name (max 60 characters)
    """
    try:
        truncated_prompt = userPrompt[:200] if len(userPrompt) > 200 else userPrompt

        name_prompt = f"""Create a professional conversation title in THE SAME LANGUAGE as the user's question.

Question: "{truncated_prompt}"

Rules:
- Title MUST be in the same language as the question (German→German, French→French, English→English)
- Max 60 characters, no punctuation (?, !, .)
- Professional and concise
- Respond ONLY with the title, nothing else"""

        await services.ai.ensureAiObjectsInitialized()

        nameRequest = AiCallRequest(
            prompt=name_prompt,
            options=AiCallOptions(
                resultFormat="txt",
                operationType=OperationTypeEnum.DATA_GENERATE,
                processingMode=ProcessingModeEnum.DETAILED,
                temperature=0.7
            )
        )

        nameResponse = await services.ai.callAi(nameRequest)
        generated_name = nameResponse.content.strip()

        # Extract first line, strip leading labels, surrounding quotes and trailing punctuation
        generated_name = generated_name.split('\n')[0].strip()
        generated_name = re.sub(r'^(Title|Titel|Titre|Name):\s*', '', generated_name, flags=re.IGNORECASE)
        generated_name = re.sub(r'^["\']|["\']$', '', generated_name)
        generated_name = re.sub(r'[?!.]+$', '', generated_name)  # Remove trailing punctuation

        # Apply title case
        if generated_name:
            words = generated_name.split()
            capitalized_words = []
            for word in words:
                if word.isupper() and len(word) > 1:
                    capitalized_words.append(word)  # Keep acronyms
                else:
                    capitalized_words.append(word.capitalize())
            generated_name = " ".join(capitalized_words).strip()

        # Validate and truncate if needed
        if not generated_name or len(generated_name) < 3:
            if userLanguage == "de":
                generated_name = "Chatbot Konversation"
            elif userLanguage == "fr":
                generated_name = "Conversation Chatbot"
            else:
                generated_name = "Chatbot Conversation"

        if len(generated_name) > 60:
            truncated = generated_name[:57]
            last_space = truncated.rfind(' ')
            generated_name = truncated[:last_space] + "..." if last_space > 30 else truncated + "..."

        logger.info(f"Generated conversation name: '{generated_name}'")
        return generated_name

    except Exception as e:
        logger.error(f"Error generating conversation name: {e}", exc_info=True)
        if userLanguage == "de":
            return "Chatbot Konversation"
        elif userLanguage == "fr":
            return "Conversation Chatbot"
        else:
            return "Chatbot Conversation"

@@ -1,21 +1,23 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Event manager for chatbot streaming.
Manages event queues for SSE streaming of chatbot progress updates.
Generic streaming event manager for real-time updates.
Manages event queues for SSE streaming across all features (chatbot, workflows, documents, etc.).
Supports event-driven streaming instead of polling.
"""

import logging
import asyncio
from typing import Dict, Optional, Any
from typing import Dict, Optional, Any, List, AsyncIterator, Set
from datetime import datetime

logger = logging.getLogger(__name__)


class ChatbotEventManager:
class StreamingEventManager:
    """
    Manages event queues for chatbot streaming.
    Generic event manager for real-time streaming across all features.
    Supports multiple event types and contexts (workflows, documents, tasks, etc.).
    Thread-safe event emission and queue management.
    """

@@ -24,128 +26,199 @@ class ChatbotEventManager:
        self._queues: Dict[str, asyncio.Queue] = {}
        self._locks: Dict[str, asyncio.Lock] = {}
        self._cleanup_tasks: Dict[str, asyncio.Task] = {}
        self._subscribers: Dict[str, Set[str]] = {}  # context_id -> set of queue_ids (for future multi-subscriber support)

    def create_queue(self, workflow_id: str) -> asyncio.Queue:
    def create_queue(self, context_id: str) -> asyncio.Queue:
        """
        Create a new event queue for a workflow.
        Create a new event queue for a context.

        Args:
            workflow_id: Workflow ID
            context_id: Context ID (workflow_id, document_id, task_id, etc.)

        Returns:
            Event queue for the workflow
            Event queue for the context
        """
        if workflow_id not in self._queues:
            self._queues[workflow_id] = asyncio.Queue()
            self._locks[workflow_id] = asyncio.Lock()
            logger.debug(f"Created event queue for workflow {workflow_id}")
        return self._queues[workflow_id]
        if context_id not in self._queues:
            self._queues[context_id] = asyncio.Queue()
            self._locks[context_id] = asyncio.Lock()
            self._subscribers[context_id] = set()
            logger.debug(f"Created event queue for context {context_id}")
        return self._queues[context_id]

    def get_queue(self, workflow_id: str) -> Optional[asyncio.Queue]:
    def get_queue(self, context_id: str) -> Optional[asyncio.Queue]:
        """
        Get existing event queue for a workflow.
        Get existing event queue for a context.

        Args:
            workflow_id: Workflow ID
            context_id: Context ID

        Returns:
            Event queue if exists, None otherwise
        """
        return self._queues.get(workflow_id)
        return self._queues.get(context_id)

    async def emit_event(
        self,
        workflow_id: str,
        context_id: str,
        event_type: str,
        message: str,
        step: Optional[str] = None,
        data: Optional[Dict[str, Any]] = None
        data: Dict[str, Any],
        event_category: str = "default",
        message: Optional[str] = None,
        step: Optional[str] = None
    ):
        """
        Emit an event to the workflow's event queue.
        Emit an event to the context's event queue.

        Args:
            workflow_id: Workflow ID
            event_type: Type of event ("status", "progress", "complete", "error")
            message: Event message
            step: Current processing step (optional)
            data: Additional event data (optional)
            context_id: Context ID (workflow_id, document_id, etc.)
            event_type: Type of event ("message", "log", "status", "progress", "complete", "error", "chatdata")
            data: Event data dictionary (will be included in event)
            event_category: Category of event for filtering ("chat", "workflow", "document", etc.)
            message: Optional event message (for backward compatibility)
            step: Optional processing step (for backward compatibility)
        """
        queue = self.get_queue(workflow_id)
        queue = self.get_queue(context_id)
        if not queue:
            logger.debug(f"No event queue found for workflow {workflow_id}, skipping event")
            logger.debug(f"No event queue found for context {context_id}, skipping event")
            return

        event = {
            "type": event_type,
            "message": message,
            "category": event_category,
            "timestamp": datetime.now().timestamp(),
            "step": step,
            "data": data or {}
            "data": data,
            "message": message,  # For backward compatibility
            "step": step  # For backward compatibility
        }

        try:
            await queue.put(event)
            logger.debug(f"Emitted {event_type} event for workflow {workflow_id}: {message[:50]}")
            logger.debug(f"Emitted {event_type} event (category: {event_category}) for context {context_id}")
        except Exception as e:
            logger.error(f"Error emitting event for workflow {workflow_id}: {e}")
            logger.error(f"Error emitting event for context {context_id}: {e}")

    async def cleanup(self, workflow_id: str, delay: float = 60.0):
    async def stream_events(
        self,
        context_id: str,
        event_categories: Optional[List[str]] = None,
        timeout: Optional[float] = None
    ) -> AsyncIterator[Dict[str, Any]]:
        """
        Async generator for streaming events from a context.

        Args:
            context_id: Context ID to stream events from
            event_categories: Optional list of event categories to filter by
            timeout: Optional timeout in seconds (None = no timeout)

        Yields:
            Event dictionaries
        """
        queue = self.get_queue(context_id)
        if not queue:
            logger.warning(f"No queue found for context {context_id}")
            return

        start_time = asyncio.get_event_loop().time() if timeout else None

        while True:
            # Check timeout
            if timeout and start_time:
                elapsed = asyncio.get_event_loop().time() - start_time
                if elapsed > timeout:
                    logger.debug(f"Stream timeout for context {context_id}")
                    break

            try:
                # Wait for event with timeout
                wait_timeout = 1.0  # Check timeout every second
                if timeout and start_time:
                    remaining = timeout - (asyncio.get_event_loop().time() - start_time)
                    if remaining <= 0:
                        break
                    wait_timeout = min(wait_timeout, remaining)

                event = await asyncio.wait_for(queue.get(), timeout=wait_timeout)

                # Filter by category if specified
                if event_categories and event.get("category") not in event_categories:
                    continue

                yield event

            except asyncio.TimeoutError:
                # Check if we should continue or timeout
                if timeout and start_time:
                    elapsed = asyncio.get_event_loop().time() - start_time
                    if elapsed >= timeout:
                        break
                continue
            except Exception as e:
                logger.error(f"Error in stream_events for context {context_id}: {e}")
                break

    async def cleanup(self, context_id: str, delay: float = 60.0):
        """
        Schedule cleanup of event queue after delay.
        This allows time for any remaining events to be consumed.

        Args:
            workflow_id: Workflow ID
            context_id: Context ID
            delay: Delay in seconds before cleanup (default: 60 seconds)
        """
        if workflow_id in self._cleanup_tasks:
        if context_id in self._cleanup_tasks:
            # Cancel existing cleanup task
            self._cleanup_tasks[workflow_id].cancel()
            self._cleanup_tasks[context_id].cancel()

        async def _cleanup():
            try:
                await asyncio.sleep(delay)
                if workflow_id in self._queues:
                if context_id in self._queues:
                    # Drain remaining events
                    queue = self._queues[workflow_id]
                    queue = self._queues[context_id]
                    while not queue.empty():
                        try:
                            queue.get_nowait()
                        except asyncio.QueueEmpty:
                            break

                    del self._queues[workflow_id]
                    del self._locks[workflow_id]
                    logger.info(f"Cleaned up event queue for workflow {workflow_id}")
                    del self._queues[context_id]
                    if context_id in self._locks:
                        del self._locks[context_id]
                    if context_id in self._subscribers:
                        del self._subscribers[context_id]
                    logger.info(f"Cleaned up event queue for context {context_id}")
            except asyncio.CancelledError:
                pass
            except Exception as e:
                logger.error(f"Error during cleanup for workflow {workflow_id}: {e}")
                logger.error(f"Error during cleanup for context {context_id}: {e}")
            finally:
                if workflow_id in self._cleanup_tasks:
                    del self._cleanup_tasks[workflow_id]
                if context_id in self._cleanup_tasks:
                    del self._cleanup_tasks[context_id]

        self._cleanup_tasks[workflow_id] = asyncio.create_task(_cleanup())
        self._cleanup_tasks[context_id] = asyncio.create_task(_cleanup())

    def has_queue(self, workflow_id: str) -> bool:
    def has_queue(self, context_id: str) -> bool:
        """
        Check if a queue exists for a workflow.
        Check if a queue exists for a context.

        Args:
            workflow_id: Workflow ID
            context_id: Context ID

        Returns:
            True if queue exists, False otherwise
        """
        return workflow_id in self._queues
        return context_id in self._queues


# Backward compatibility: ChatbotEventManager is an alias
ChatbotEventManager = StreamingEventManager

# Global singleton instance
_event_manager = ChatbotEventManager()
_event_manager = StreamingEventManager()


def get_event_manager() -> ChatbotEventManager:
def get_event_manager() -> StreamingEventManager:
    """Get the global event manager instance."""
    return _event_manager
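
Taken together, the generalized manager is used in three steps: create a queue for a context, emit categorized events from producers, and consume them with the async generator. A self-contained sketch with hypothetical payloads:

```python
import asyncio

async def demo():
    manager = get_event_manager()
    manager.create_queue("wf-123")

    async def producer():
        await manager.emit_event(
            context_id="wf-123",
            event_type="chatdata",
            data={"type": "message", "createdAt": 1700000000.0, "item": {"text": "Hallo"}},
            event_category="chat",
        )
        await manager.emit_event(
            context_id="wf-123",
            event_type="complete",
            data={},
            event_category="workflow",
        )

    asyncio.create_task(producer())

    # Consume only "chat" events; the "workflow" completion event is filtered
    # out, so this loop ends via the 5-second timeout.
    async for event in manager.stream_events("wf-123", event_categories=["chat"], timeout=5.0):
        print(event["type"], event["data"])

    await manager.cleanup("wf-123", delay=0.0)

# asyncio.run(demo())
```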

File diff suppressed because it is too large
@@ -1059,6 +1059,26 @@ class ChatObjects:
            actionName=createdMessage.get("actionName")
        )

        # Emit message event for streaming (if event manager is available)
        try:
            from modules.features.chatbot.eventManager import get_event_manager
            event_manager = get_event_manager()
            message_timestamp = parseTimestamp(chat_message.publishedAt, default=getUtcTimestamp())
            # Emit message event in exact chatData format: {type, createdAt, item}
            asyncio.create_task(event_manager.emit_event(
                context_id=workflowId,
                event_type="chatdata",
                data={
                    "type": "message",
                    "createdAt": message_timestamp,
                    "item": chat_message.dict()
                },
                event_category="chat"
            ))
        except Exception as e:
            # Event manager not available or error - continue without emitting
            logger.debug(f"Could not emit message event: {e}")

        # Debug: Store message and documents for debugging - only if debug enabled
        storeDebugMessageAndDocuments(chat_message, self.currentUser)
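
The emission here is deliberately fire-and-forget: `asyncio.create_task` schedules the queue put without blocking message persistence, and the broad `except` keeps streaming strictly optional. The same pattern, condensed into a standalone sketch (names are placeholders):

```python
import asyncio
import logging

logger = logging.getLogger(__name__)

def emit_best_effort(event_manager, workflow_id: str, payload: dict) -> None:
    """Schedule an event emission without blocking or failing the caller."""
    try:
        # Requires a running event loop; in this codebase the call happens
        # inside async request handling, so create_task is safe here.
        asyncio.create_task(event_manager.emit_event(
            context_id=workflow_id,
            event_type="chatdata",
            data=payload,
            event_category="chat",
        ))
    except Exception as e:
        # No queue, no loop, or any other failure: streaming stays optional.
        logger.debug(f"Could not emit message event: {e}")
```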

@@ -80,43 +80,64 @@ async def stream_chatbot_start(
    queue = event_manager.create_queue(workflow.id)

    async def event_stream():
        """Async generator for SSE events."""
        """Async generator for SSE events - pure event-driven streaming (no polling)."""
        try:
            # Get interface for status checks and chat data
            # Get interface for initial data and status checks
            interfaceDbChat = getServiceChat(currentUser)

            # Send initial chat data (exact format as chatData endpoint)
            # Get current workflow to check if resuming and get current round
            current_workflow = interfaceDbChat.getWorkflow(workflow.id)
            current_round = current_workflow.currentRound if current_workflow else None
            is_resuming = final_workflow_id is not None and current_round and current_round > 1

            # Send initial chat data (exact format as chatData endpoint) - only once at start
            try:
                chatData = interfaceDbChat.getUnifiedChatData(workflow.id, None)
                if chatData.get("items"):
                    # Filter items by round number if resuming
                    filtered_items = []
                    for item in chatData["items"]:
                        if is_resuming and current_round:
                            # Get round number from item
                            item_round = None
                            item_data = item.get("item")
                            if item_data:
                                # Handle both dict and object access
                                if isinstance(item_data, dict):
                                    item_round = item_data.get("roundNumber")
                                elif hasattr(item_data, "roundNumber"):
                                    item_round = item_data.roundNumber

                            # When resuming, only include items from current round onwards
                            # Exclude items without roundNumber (they're from old rounds before roundNumber was added)
                            # Exclude items with roundNumber < current_round (from previous rounds)
                            if item_round is None or item_round < current_round:
                                continue  # Skip items from previous rounds or without round info

                        filtered_items.append(item)

                    # Emit filtered items
                    for item in filtered_items:
                        # Convert Pydantic models to dicts for JSON serialization
                        serializable_item = {
                            "type": item.get("type"),
                            "createdAt": item.get("createdAt"),
                            "item": item.get("item").dict() if hasattr(item.get("item"), "dict") else item.get("item")
                        }
                        # Emit item directly in exact chatData format: {type, createdAt, item}
                        yield f"data: {json.dumps(item)}\n\n"
                    # Set initial timestamp for incremental fetching
                    if chatData["items"]:
                        timestamps = [parseTimestamp(item.get("createdAt"), default=0) for item in chatData["items"]]
                        last_chatdata_timestamp = max(timestamps) if timestamps else None
                    else:
                        last_chatdata_timestamp = None
                else:
                    last_chatdata_timestamp = None
                        yield f"data: {json.dumps(serializable_item)}\n\n"
            except Exception as e:
                logger.warning(f"Error fetching initial chat data: {e}")
                last_chatdata_timestamp = None

            # Keepalive interval (30 seconds)
            keepalive_interval = 30.0
            last_keepalive = asyncio.get_event_loop().time()

            # Status check interval (check workflow status every 3 seconds)
            status_check_interval = 3.0
            # Status check interval (check workflow status every 5 seconds - less frequent since we're event-driven)
            status_check_interval = 5.0
            last_status_check = asyncio.get_event_loop().time()

            # Chat data fetch interval (fetch chat data every 0.5 seconds for real-time updates)
            chatdata_fetch_interval = 0.5
            last_chatdata_fetch = asyncio.get_event_loop().time()

            # Stream events until completion or timeout
            # Stream events until completion or timeout - pure event-driven (no polling)
            timeout = 300.0  # 5 minutes max
            start_time = asyncio.get_event_loop().time()

@@ -124,7 +145,6 @@ async def stream_chatbot_start(
                # Check timeout
                elapsed = asyncio.get_event_loop().time() - start_time
                if elapsed > timeout:
                    # Timeout - just close stream, don't emit non-chatData format events
                    logger.info(f"Stream timeout for workflow {workflow.id}")
                    break

@@ -135,69 +155,51 @@ async def stream_chatbot_start(
                current_time = asyncio.get_event_loop().time()

                # Periodically check workflow status and fetch chat data
                # Periodically check workflow status (less frequent since we're event-driven)
                if current_time - last_status_check >= status_check_interval:
                    try:
                        current_workflow = interfaceDbChat.getWorkflow(workflow.id)
                        if current_workflow and current_workflow.status == "stopped":
                            logger.info(f"Workflow {workflow.id} was stopped, closing stream")
                            # Don't emit stopped event - just close stream
                            break
                    except Exception as e:
                        logger.warning(f"Error checking workflow status: {e}")
                    last_status_check = current_time

                # Periodically fetch and emit chat data
                if current_time - last_chatdata_fetch >= chatdata_fetch_interval:
                    try:
                        chatData = interfaceDbChat.getUnifiedChatData(workflow.id, last_chatdata_timestamp)
                        if chatData.get("items"):
                            # Emit items directly in exact chatData format: {type, createdAt, item}
                            for item in chatData["items"]:
                                yield f"data: {json.dumps(item)}\n\n"
                            # Update timestamp to only get new items next time
                            if chatData["items"]:
                                # Parse timestamps and get the maximum
                                timestamps = []
                                for item in chatData["items"]:
                                    ts = parseTimestamp(item.get("createdAt"), default=0)
                                    timestamps.append(ts)
                                if timestamps:
                                    last_chatdata_timestamp = max(timestamps)
                    except Exception as e:
                        logger.warning(f"Error fetching chat data: {e}")
                    last_chatdata_fetch = current_time

                # Try to get event with timeout
                # Get event from queue (pure event-driven - no polling database)
                try:
                    event = await asyncio.wait_for(queue.get(), timeout=1.0)

                    # Only emit chatdata events (messages, logs, stats) in exact chatData format
                    # Ignore status/progress/complete/stopped/error events that don't match the format
                    if event.get("type") == "chatdata" and event.get("data"):
                        # Emit item directly in exact chatData format: {type, createdAt, item}
                        chatdata_item = event.get("data")
                        yield f"data: {json.dumps(chatdata_item)}\n\n"
                        # Update timestamp for incremental fetching
                        if chatdata_item.get("createdAt"):
                            last_chatdata_timestamp = parseTimestamp(chatdata_item["createdAt"], default=None)
                    # Handle different event types
                    event_type = event.get("type")
                    event_data = event.get("data", {})

                    # Check if this is a completion/stopped event to close stream
                    if event.get("type") == "complete":
                    # Emit chatdata events (messages, logs, stats) in exact chatData format
                    if event_type == "chatdata" and event_data:
                        # Emit item directly in exact chatData format: {type, createdAt, item}
                        chatdata_item = event_data
                        # Ensure item field is serializable (convert Pydantic models to dicts)
                        if isinstance(chatdata_item, dict) and "item" in chatdata_item:
                            item_obj = chatdata_item.get("item")
                            if hasattr(item_obj, "dict"):
                                chatdata_item = chatdata_item.copy()
                                chatdata_item["item"] = item_obj.dict()
                        yield f"data: {json.dumps(chatdata_item)}\n\n"

                    # Handle completion/stopped events to close stream
                    elif event_type == "complete":
                        logger.info(f"Workflow {workflow.id} completed, closing stream")
                        break
                    elif event.get("type") == "stopped":
                        # Workflow was stopped, close stream
                    elif event_type == "stopped":
                        logger.info(f"Workflow {workflow.id} stopped, closing stream")
                        break
                    elif event.get("type") == "error" and event.get("step") == "error":
                        # Final error, close stream
                    elif event_type == "error" and event.get("step") == "error":
                        logger.warning(f"Workflow {workflow.id} error, closing stream")
                        break

                    last_keepalive = asyncio.get_event_loop().time()
                    last_keepalive = current_time
                except asyncio.TimeoutError:
                    # Send keepalive if needed
                    # Send keepalive if needed (no events received, but keep connection alive)
                    current_time = asyncio.get_event_loop().time()
                    if current_time - last_keepalive >= keepalive_interval:
                        yield f": keepalive\n\n"

@@ -205,14 +207,12 @@ async def stream_chatbot_start(
                    continue
                except Exception as e:
                    logger.error(f"Error in event stream: {e}")
                    yield f"data: {json.dumps({'type': 'error', 'message': f'Stream error: {str(e)}'})}\n\n"
                    break

        except Exception as e:
            logger.error(f"Error in event stream generator: {e}", exc_info=True)
            # Don't emit error events that don't match chatData format
        finally:
            # Stream ends - no final event needed as it doesn't match chatData format
            # Stream ends - cleanup handled by event manager
            pass

    return StreamingResponse(
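
On the wire, each yielded chunk is one SSE `data:` frame carrying exactly one chatData item, with comment lines as keepalives. Because only `chatdata` events are written, a client can feed every frame into the same handling it already uses for `getUnifiedChatData()` responses. Illustrative frames (payload values are made up):

```python
# Illustrative wire format of the stream above. Every event is a single
# "data:" line holding one chatData item; lines starting with ":" are
# SSE comments (keepalives) that clients must ignore.
EXAMPLE_SSE_FRAMES = (
    'data: {"type": "message", "createdAt": 1700000000.0, "item": {"text": "Hallo"}}\n\n'
    'data: {"type": "log", "createdAt": 1700000001.2, "item": {"message": "Running query"}}\n\n'
    ': keepalive\n\n'
)
```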

@@ -248,10 +248,12 @@ async def stop_chatbot(
    # Emit stopped event to active streams
    event_manager = get_event_manager()
    await event_manager.emit_event(
        workflowId,
        "stopped",
        "Workflow stopped by user",
        "stopped"
        context_id=workflowId,
        event_type="stopped",
        data={"workflowId": workflowId},
        event_category="workflow",
        message="Workflow stopped by user",
        step="stopped"
    )
    logger.info(f"Emitted stopped event for workflow {workflowId}")

@@ -371,15 +371,36 @@ Respond with ONLY a JSON object in this exact format:
            operationType=opType.value
        )

        self.services.chat.storeWorkflowStat(
            self.services.workflow,
            response,
            f"ai.{opType.name.lower()}"
        )
        # Try to store workflow stats, but don't fail if workflow is None (e.g., in chatbot context)
        try:
            self.services.chat.storeWorkflowStat(
                self.services.workflow,
                response,
                f"ai.{opType.name.lower()}"
            )
        except Exception as e:
            # Log but don't fail - workflow might be None in some contexts (e.g., chatbot)
            logger.debug(f"Could not store workflow stat (workflow may be None): {str(e)}")

        self.services.chat.progressLogUpdate(aiOperationId, 0.9, f"{opType.name} completed")
        self.services.chat.progressLogFinish(aiOperationId, True)

        # Preserve metadata from response if available (e.g., results_with_content from Tavily)
        # Check if response has metadata attribute (AiCallResponse from callAi)
        if hasattr(response, 'metadata') and response.metadata:
            # If metadata is a dict, store it in additionalData
            if isinstance(response.metadata, dict):
                if not metadata.additionalData:
                    metadata.additionalData = {}
                metadata.additionalData.update(response.metadata)
            # If metadata is an object with attributes, extract them
            elif hasattr(response.metadata, '__dict__'):
                if not metadata.additionalData:
                    metadata.additionalData = {}
                for key, value in response.metadata.__dict__.items():
                    if not key.startswith('_'):
                        metadata.additionalData[key] = value

        return AiResponse(
            content=response.content,
            metadata=metadata
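
The merge above normalizes two shapes of `response.metadata` into `additionalData`. The same normalization as a standalone sketch (simplified types, not the real models):

```python
from typing import Any, Dict, Optional

def merge_response_metadata(resp_metadata: Any, additional: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    """Fold dict-shaped or attribute-shaped metadata into one dict."""
    merged = dict(additional or {})
    if isinstance(resp_metadata, dict):
        merged.update(resp_metadata)
    elif hasattr(resp_metadata, "__dict__"):
        # Skip private attributes, mirroring the hunk above.
        merged.update({k: v for k, v in resp_metadata.__dict__.items() if not k.startswith("_")})
    return merged

# merge_response_metadata({"results_with_content": []}, None)
# -> {"results_with_content": []}
```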

@@ -85,22 +85,119 @@ class WebService:
        if extractedUrls:
            allUrls.extend(extractedUrls)

        # Step 2: Search for URLs if needed (based on needsSearch flag)
        # Step 2: Search for URLs and content if needed (based on needsSearch flag)
        searchUrls = []
        searchResultsWithContent = []
        if needsSearch and (not allUrls or len(allUrls) < maxNumberPages):
            self.services.chat.progressLogUpdate(operationId, 0.3, "Searching for URLs")
            self.services.chat.progressLogUpdate(operationId, 0.3, "Searching for URLs and content")

            searchUrls = await self._performWebSearch(
                instruction=instruction,
                maxNumberPages=maxNumberPages - len(allUrls),
                country=countryCode,
                language=languageCode
            )
            try:
                searchUrls, searchResultsWithContent = await self._performWebSearch(
                    instruction=instruction,
                    maxNumberPages=maxNumberPages - len(allUrls),
                    country=countryCode,
                    language=languageCode
                )
                logger.info(f"Tavily search returned {len(searchUrls)} URLs with {len(searchResultsWithContent)} results containing content")
            except Exception as e:
                logger.error(f"Error performing Tavily search (continuing with other URLs): {str(e)}", exc_info=True)
                searchUrls = []
                searchResultsWithContent = []

            # Add search URLs to the list
            allUrls.extend(searchUrls)
            # Prioritize Tavily search URLs over AI-extracted URLs (they're more relevant)
            if searchUrls:
                # Prepend Tavily URLs to the list (they're more relevant)
                allUrls = searchUrls + allUrls
                logger.info(f"Using {len(searchUrls)} Tavily URLs + {len(allUrls) - len(searchUrls)} other URLs = {len(allUrls)} total")
            else:
                # If Tavily search failed, use AI-extracted URLs
                logger.warning("Tavily search returned no URLs, using AI-extracted URLs only")

        self.services.chat.progressLogUpdate(operationId, 0.5, f"Found {len(allUrls)} total URLs")

        # If we have search results (even without content), use them directly instead of crawling
        # Tavily search results are more relevant than generic AI-extracted URLs
        # Only crawl if we have NO search results at all
        if searchResultsWithContent and len(searchResultsWithContent) > 0:
            urls_with_actual_content = sum(1 for r in searchResultsWithContent if r.get("content") and len(r.get("content", "")) > 0)
            logger.info(f"Using {len(searchResultsWithContent)} Tavily search results ({urls_with_actual_content} with content) directly (skipping crawl)")

            # Convert search results to crawl result format
            crawlResult = []
            for result in searchResultsWithContent:
                crawlResult.append({
                    "url": result["url"],
                    "title": result.get("title", ""),
                    "content": result.get("content", "")
                })

            # Calculate statistics
            totalResults = len(crawlResult)
            totalContentLength = sum(len(r.get("content", "")) for r in crawlResult)
            urlsWithContent = sum(1 for r in crawlResult if r.get("content") and len(r.get("content", "")) > 0)

            # Log content availability
            if urlsWithContent == 0:
                logger.warning(f"Tavily search returned {len(searchResultsWithContent)} results but none have content - URLs will be used but may need crawling")
            else:
                logger.info(f"Tavily search provided content for {urlsWithContent}/{len(searchResultsWithContent)} URLs")

            # Even if content is empty, use these results - they're more relevant than generic URLs
            # The final answer generation can work with URLs even if content is empty

            # Convert to sections format
            sections = []
            for idx, item in enumerate(crawlResult):
                section = {
                    "id": f"result_{idx}",
                    "content_type": "paragraph",
                    "title": item.get("title") or item.get("url", f"Result {idx + 1}"),
                    "order": idx
                }
                content = item.get("content", "")
                if content:
                    section["textPreview"] = content[:200] + ("..." if len(content) > 200 else "")
                sections.append(section)

            # Return consolidated result
            result = {
                "metadata": {
                    "title": suggestedFilename or (instruction[:100] if instruction else "Web Research Results"),
                    "extraction_method": "tavily_search_direct",
                    "research_depth": finalResearchDepth,
                    "country": countryCode,
                    "language": languageCode,
                    "urls_searched": searchUrls[:20],
                    "total_urls": len(searchUrls),
                    "urls_with_content": urlsWithContent,
                    "total_content_length": totalContentLength,
                    "search_date": self.services.utils.timestampGetUtc() if hasattr(self.services, 'utils') else None
                },
                "sections": sections,
                "statistics": {
                    "sectionCount": len(sections),
                    "total_urls": len(searchUrls),
                    "results_count": totalResults,
                    "urls_with_content": urlsWithContent,
                    "total_content_length": totalContentLength
                },
                "instruction": instruction,
                "urls_crawled": searchUrls,
                "total_urls": len(searchUrls),
                "results": crawlResult,
                "total_results": totalResults
            }

            if suggestedFilename:
                result["suggested_filename"] = suggestedFilename
                result["metadata"]["suggested_filename"] = suggestedFilename

            if operationId:
                self.services.chat.progressLogUpdate(operationId, 0.9, "Completed")
                self.services.chat.progressLogFinish(operationId, True)

            return result

        # Step 3: Validate and filter URLs before crawling
        validatedUrls = self._validateUrls(allUrls)
        if not validatedUrls:
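
Downstream consumers of this consolidated dict typically need only a few keys; a defensive-reading sketch (field names match the hunk above, output is illustrative):

```python
def summarize_research(result: dict) -> str:
    """Build a one-line summary from the consolidated web-research result."""
    stats = result.get("statistics", {})
    method = result.get("metadata", {}).get("extraction_method", "unknown")
    return (
        f"{stats.get('results_count', 0)} results via {method}, "
        f"{stats.get('urls_with_content', 0)} with content, "
        f"{stats.get('total_content_length', 0)} chars total"
    )

# e.g. summarize_research(result)
# -> "5 results via tavily_search_direct, 3 with content, 8421 chars total"
```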

@@ -328,8 +425,16 @@ Return ONLY valid JSON, no additional text:
        maxNumberPages: int,
        country: Optional[str],
        language: Optional[str]
    ) -> List[str]:
        """Perform web search to find URLs."""
    ) -> tuple[List[str], List[Dict[str, Any]]]:
        """
        Perform web search to find URLs and content.

        Returns:
            Tuple of (urls, search_results_with_content)
            - urls: List of URL strings
            - search_results_with_content: List of dicts with url, title, content from Tavily search
        """
        search_results_with_content = []
        try:
            # Build search prompt model
            searchPromptModel = AiCallPromptWebSearch(

@@ -356,46 +461,224 @@ Return ONLY valid JSON, no additional text:
                outputFormat="json"
            )

            # Check if metadata contains results with content (from Tavily)
            if hasattr(searchResponse, 'metadata') and searchResponse.metadata:
                # Check in additionalData first (where we store custom metadata)
                additional_data = None
                if hasattr(searchResponse.metadata, 'additionalData') and searchResponse.metadata.additionalData:
                    additional_data = searchResponse.metadata.additionalData
                elif isinstance(searchResponse.metadata, dict):
                    additional_data = searchResponse.metadata.get("additionalData", {})

                if additional_data:
                    results_with_content = additional_data.get("results_with_content", [])
                    if results_with_content:
                        logger.info(f"Found {len(results_with_content)} search results with content in metadata.additionalData")
                        # Extract URLs and content from metadata
                        for result in results_with_content:
                            if result.get("url"):
                                search_results_with_content.append({
                                    "url": result.get("url"),
                                    "title": result.get("title", ""),
                                    "content": result.get("content", ""),
                                    "score": result.get("score", 0)
                                })

                # Also check directly in metadata (fallback)
                if not search_results_with_content:
                    results_with_content = None
                    if hasattr(searchResponse.metadata, 'results_with_content'):
                        results_with_content = searchResponse.metadata.results_with_content
                    elif isinstance(searchResponse.metadata, dict):
                        results_with_content = searchResponse.metadata.get("results_with_content", [])

                    if results_with_content:
                        logger.info(f"Found {len(results_with_content)} search results with content in metadata (direct)")
                        for result in results_with_content:
                            if result.get("url"):
                                search_results_with_content.append({
                                    "url": result.get("url"),
                                    "title": result.get("title", ""),
                                    "content": result.get("content", ""),
                                    "score": result.get("score", 0)
                                })

            # Extract content from AiResponse
            searchResult = searchResponse.content

            logger.debug(f"Search response content type: {type(searchResult)}, length: {len(str(searchResult)) if searchResult else 0}")

            # Debug: persist search response
            if isinstance(searchResult, str):
                self.services.utils.writeDebugFile(searchResult, "websearch_response")
                logger.debug(f"Search response (first 500 chars): {searchResult[:500]}")
            else:
                self.services.utils.writeDebugFile(json.dumps(searchResult, indent=2), "websearch_response")
                logger.debug(f"Search response type: {type(searchResult)}, keys: {list(searchResult.keys()) if isinstance(searchResult, dict) else 'N/A'}")

            # Parse and extract URLs
            # Parse and extract URLs and content
            if isinstance(searchResult, str):
                # Extract JSON from response (handles markdown code blocks)
                extractedJson = self.services.utils.jsonExtractString(searchResult)
                searchData = json.loads(extractedJson) if extractedJson else json.loads(searchResult)
                if extractedJson:
                    try:
                        searchData = json.loads(extractedJson)
                        logger.debug(f"Parsed JSON from extracted string, type: {type(searchData)}")
                    except json.JSONDecodeError as e:
                        logger.warning(f"Failed to parse extracted JSON: {e}, trying direct parse")
                        searchData = json.loads(searchResult)
                else:
                    try:
                        searchData = json.loads(searchResult)
                        logger.debug(f"Parsed JSON directly from string, type: {type(searchData)}")
                    except json.JSONDecodeError as e:
                        logger.error(f"Failed to parse search result as JSON: {e}")
                        searchData = {}
            else:
                searchData = searchResult
                logger.debug(f"Using searchResult directly as searchData, type: {type(searchData)}")

            # Extract URLs from response
            logger.debug(f"Final searchData type: {type(searchData)}, is dict: {isinstance(searchData, dict)}, keys: {list(searchData.keys()) if isinstance(searchData, dict) else 'N/A'}")

            # Extract URLs and content from response
            urls = []
            if isinstance(searchData, dict):
                if "urls" in searchData:
                # Check for new format: {"urls": [...], "results": [...]}
                if "urls" in searchData and "results" in searchData:
                    urls = searchData["urls"]
                    # Extract results with content
                    for r in searchData["results"]:
                        if r.get("url"):
                            # Only add if not already added from metadata
                            if not any(sr["url"] == r.get("url") for sr in search_results_with_content):
                                search_results_with_content.append({
                                    "url": r.get("url"),
                                    "title": r.get("title", ""),
                                    "content": r.get("content", ""),
                                    "score": r.get("score", 0)
                                })
                    logger.debug(f"Extracted {len(urls)} URLs and {len(search_results_with_content)} results with content from new format")
                elif "urls" in searchData:
                    urls = searchData["urls"]
                    logger.debug(f"Extracted {len(urls)} URLs from 'urls' field")
                elif "results" in searchData:
                    urls = [r.get("url") for r in searchData["results"] if r.get("url")]
                    # Extract URLs from results (content already in search_results_with_content if from metadata)
                    for r in searchData["results"]:
                        if r.get("url"):
                            urls.append(r.get("url"))
                            # Only add to search_results_with_content if not already added from metadata
                            if not any(sr["url"] == r.get("url") for sr in search_results_with_content):
                                search_results_with_content.append({
                                    "url": r.get("url"),
                                    "title": r.get("title", ""),
                                    "content": r.get("raw_content") or r.get("content", ""),
                                    "score": r.get("score", 0)
                                })
                    logger.debug(f"Extracted {len(urls)} URLs with content from 'results' field")
                else:
                    logger.warning(f"Unexpected search data structure (dict): {list(searchData.keys())}")
            elif isinstance(searchData, list):
                # Handle both cases: list of URL strings or list of dicts with "url" key
                for item in searchData:
                    if isinstance(item, str):
                        # Item is already a URL string
                        urls.append(item)
                    elif isinstance(item, dict) and item.get("url"):
                        # Item is a dict with "url" key
                        urls.append(item.get("url"))
                    elif isinstance(item, dict):
                        if item.get("url"):
                            urls.append(item.get("url"))
                            # Only add to search_results_with_content if not already added from metadata
                            if not any(sr["url"] == item.get("url") for sr in search_results_with_content):
                                search_results_with_content.append({
                                    "url": item.get("url"),
                                    "title": item.get("title", ""),
                                    "content": item.get("raw_content") or item.get("content", ""),
                                    "score": item.get("score", 0)
                                })
                logger.debug(f"Extracted {len(urls)} URLs from list")
            else:
                logger.warning(f"Unexpected search data type: {type(searchData)}")

            logger.info(f"Web search returned {len(urls)} URLs")
            return urls
            # If we got URLs but no content from metadata, extract URLs from search_results_with_content
            if urls and not search_results_with_content:
                # URLs were extracted but no content - create entries with empty content
                for url in urls:
                    search_results_with_content.append({
                        "url": url,
                        "title": "",
                        "content": "",
                        "score": 0
                    })
            elif search_results_with_content and not urls:
                # We have content but no URLs - extract URLs from content results
                urls = [r["url"] for r in search_results_with_content]

            # If we have URLs but no search_results_with_content, create entries from URLs
            # This ensures we can use Tavily URLs even if content extraction failed
            if urls and not search_results_with_content:
                logger.warning("Got URLs from Tavily search but no content extracted - creating entries for direct use")
                for url in urls:
                    search_results_with_content.append({
                        "url": url,
                        "title": "",
                        "content": "",  # Empty content - will need crawling if used
                        "score": 0
                    })

            logger.info(f"Web search returned {len(urls)} URLs with {len(search_results_with_content)} results")
            if search_results_with_content:
                content_count = sum(1 for r in search_results_with_content if r.get("content") and len(r.get("content", "")) > 0)
                logger.info(f" - {content_count} results have content, {len(search_results_with_content) - content_count} without content")
                if content_count > 0:
                    first_with_content = next((r for r in search_results_with_content if r.get("content")), None)
                    if first_with_content:
                        logger.info(f"Content preview from first result with content: {first_with_content.get('content', '')[:200]}")
            else:
                logger.warning("No search results extracted - will need to crawl URLs")
            return urls, search_results_with_content
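
One design note on the extraction above: the repeated `any(sr["url"] == ...)` membership tests make URL deduplication quadratic in the number of results, which is harmless at Tavily result counts but easy to keep linear with a seen-set. A sketch of the equivalent dedup (not part of this commit):

```python
from typing import Any, Dict, List

def dedup_results(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Keep the first entry per URL, preserving order, in O(n)."""
    seen = set()
    unique: List[Dict[str, Any]] = []
    for r in results:
        url = r.get("url")
        if url and url not in seen:
            seen.add(url)
            unique.append(r)
    return unique
```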

        except Exception as e:
            logger.error(f"Error in web search: {str(e)}")
            return []
            logger.error(f"Error in web search: {str(e)}", exc_info=True)
            # Even if there's an error, try to extract URLs from the response if available
            recovered_urls = []
            recovered_results = []
            try:
                if 'searchResponse' in locals() and searchResponse:
                    logger.info(f"Attempting to extract URLs from error response: {type(searchResponse)}")
                    # Try to get content from response
                    if hasattr(searchResponse, 'content'):
                        errorContent = searchResponse.content
                        if isinstance(errorContent, str):
                            # Try to parse as JSON
                            try:
                                errorData = json.loads(errorContent)
                                if isinstance(errorData, dict):
                                    if "urls" in errorData:
                                        recovered_urls = errorData["urls"]
                                    elif "results" in errorData:
                                        recovered_urls = [r.get("url") for r in errorData["results"] if r.get("url")]
                                        recovered_results = [{"url": r.get("url"), "title": r.get("title", ""), "content": r.get("content", ""), "score": 0} for r in errorData["results"]]
                                elif isinstance(errorData, list):
                                    recovered_urls = [item if isinstance(item, str) else item.get("url", "") for item in errorData if item]
                                if recovered_urls:
                                    logger.info(f"Recovered {len(recovered_urls)} URLs from error response")
                                    # Create entries for recovered URLs
                                    if not recovered_results:
                                        for url in recovered_urls:
                                            recovered_results.append({"url": url, "title": "", "content": "", "score": 0})
                                    return recovered_urls, recovered_results
                            except Exception as parseError:
                                logger.debug(f"Failed to parse error response: {parseError}")
            except Exception as recoverError:
                logger.debug(f"Failed to recover URLs from error: {recoverError}")

            # If we have URLs from earlier extraction, return them
            if 'urls' in locals() and urls:
                logger.info(f"Returning {len(urls)} URLs extracted before error occurred")
                # Create entries from URLs
                results_from_urls = [{"url": url, "title": "", "content": "", "score": 0} for url in urls]
                return urls, results_from_urls

            return [], []

    def _validateUrls(self, urls: List[str]) -> List[str]:
        """