integration fixes
This commit is contained in:
parent
227d7b9401
commit
73c1126200
12 changed files with 485 additions and 256 deletions
|
|
@ -30,3 +30,8 @@ Web_Search_MIN_RESULTS = 1
|
||||||
Web_Crawl_TIMEOUT = 30
|
Web_Crawl_TIMEOUT = 30
|
||||||
Web_Crawl_MAX_RETRIES = 3
|
Web_Crawl_MAX_RETRIES = 3
|
||||||
Web_Crawl_RETRY_DELAY = 2
|
Web_Crawl_RETRY_DELAY = 2
|
||||||
|
|
||||||
|
# Web Research configuration
|
||||||
|
Web_Research_MAX_DEPTH = 2
|
||||||
|
Web_Research_MAX_LINKS_PER_DOMAIN = 4
|
||||||
|
Web_Research_CRAWL_TIMEOUT_MINUTES = 10
|
||||||
|
|
@ -271,6 +271,7 @@ class ConnectorWeb:
|
||||||
include_domains: list[str] | None = None,
|
include_domains: list[str] | None = None,
|
||||||
exclude_domains: list[str] | None = None,
|
exclude_domains: list[str] | None = None,
|
||||||
language: str | None = None,
|
language: str | None = None,
|
||||||
|
country: str | None = None,
|
||||||
include_answer: bool | None = None,
|
include_answer: bool | None = None,
|
||||||
include_raw_content: bool | None = None,
|
include_raw_content: bool | None = None,
|
||||||
) -> list[WebSearchResult]:
|
) -> list[WebSearchResult]:
|
||||||
|
|
@ -290,17 +291,20 @@ class ConnectorWeb:
|
||||||
kwargs["time_range"] = time_range
|
kwargs["time_range"] = time_range
|
||||||
if topic is not None:
|
if topic is not None:
|
||||||
kwargs["topic"] = topic
|
kwargs["topic"] = topic
|
||||||
if include_domains is not None:
|
if include_domains is not None and len(include_domains) > 0:
|
||||||
kwargs["include_domains"] = include_domains
|
kwargs["include_domains"] = include_domains
|
||||||
if exclude_domains is not None:
|
if exclude_domains is not None:
|
||||||
kwargs["exclude_domains"] = exclude_domains
|
kwargs["exclude_domains"] = exclude_domains
|
||||||
if language is not None:
|
if language is not None:
|
||||||
kwargs["language"] = language
|
kwargs["language"] = language
|
||||||
|
if country is not None:
|
||||||
|
kwargs["country"] = country
|
||||||
if include_answer is not None:
|
if include_answer is not None:
|
||||||
kwargs["include_answer"] = include_answer
|
kwargs["include_answer"] = include_answer
|
||||||
if include_raw_content is not None:
|
if include_raw_content is not None:
|
||||||
kwargs["include_raw_content"] = include_raw_content
|
kwargs["include_raw_content"] = include_raw_content
|
||||||
|
|
||||||
|
logger.debug(f"Tavily.search kwargs: {kwargs}")
|
||||||
response = await self.client.search(**kwargs)
|
response = await self.client.search(**kwargs)
|
||||||
|
|
||||||
return [
|
return [
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
import logging
|
import logging
|
||||||
|
import asyncio
|
||||||
from typing import Dict, Any, List, Union, Tuple, Optional
|
from typing import Dict, Any, List, Union, Tuple, Optional
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
|
@ -694,7 +695,22 @@ class AiObjects:
|
||||||
logger.warning(f"Failed to extract links from content: {e}")
|
logger.warning(f"Failed to extract links from content: {e}")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
async def crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10) -> Dict[str, str]:
|
def _normalizeUrl(self, url: str) -> str:
|
||||||
|
"""Normalize URL to handle variations that should be considered duplicates."""
|
||||||
|
if not url:
|
||||||
|
return url
|
||||||
|
|
||||||
|
# Remove trailing slashes and fragments
|
||||||
|
url = url.rstrip('/')
|
||||||
|
if '#' in url:
|
||||||
|
url = url.split('#')[0]
|
||||||
|
|
||||||
|
# Handle common URL variations
|
||||||
|
url = url.replace('http://', 'https://') # Normalize protocol
|
||||||
|
|
||||||
|
return url
|
||||||
|
|
||||||
|
async def crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10, global_processed_urls: Optional[set] = None) -> Dict[str, str]:
|
||||||
"""
|
"""
|
||||||
Recursively crawl URLs up to specified depth.
|
Recursively crawl URLs up to specified depth.
|
||||||
|
|
||||||
|
|
@ -703,76 +719,100 @@ class AiObjects:
|
||||||
max_depth: Maximum depth to crawl (1=main pages only, 2=main+sub-pages, etc.)
|
max_depth: Maximum depth to crawl (1=main pages only, 2=main+sub-pages, etc.)
|
||||||
extract_depth: Tavily extract depth setting
|
extract_depth: Tavily extract depth setting
|
||||||
max_per_domain: Maximum URLs per domain per level
|
max_per_domain: Maximum URLs per domain per level
|
||||||
|
global_processed_urls: Optional global set to track processed URLs across sessions
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dictionary mapping URL -> content for all crawled pages
|
Dictionary mapping URL -> content for all crawled pages
|
||||||
"""
|
"""
|
||||||
logger.info(f"Starting recursive crawl: {len(urls)} starting URLs, max_depth={max_depth}")
|
logger.info(f"Starting recursive crawl: {len(urls)} starting URLs, max_depth={max_depth}")
|
||||||
|
|
||||||
# URL index to track all processed URLs
|
# URL index to track all processed URLs (local + global)
|
||||||
processed_urls = set()
|
processed_urls = set()
|
||||||
|
if global_processed_urls is not None:
|
||||||
|
# Use global index if provided, otherwise create local one
|
||||||
|
processed_urls = global_processed_urls
|
||||||
|
logger.info(f"Using global URL index with {len(processed_urls)} already processed URLs")
|
||||||
|
else:
|
||||||
|
logger.info("Using local URL index for this crawl session")
|
||||||
|
|
||||||
all_content = {}
|
all_content = {}
|
||||||
|
|
||||||
# Current level URLs to process
|
# Current level URLs to process
|
||||||
current_level_urls = urls.copy()
|
current_level_urls = urls.copy()
|
||||||
|
|
||||||
for depth in range(1, max_depth + 1):
|
try:
|
||||||
logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===")
|
for depth in range(1, max_depth + 1):
|
||||||
logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}")
|
logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===")
|
||||||
|
logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}")
|
||||||
|
|
||||||
# URLs found at this level (for next iteration)
|
# URLs found at this level (for next iteration)
|
||||||
next_level_urls = []
|
next_level_urls = []
|
||||||
|
|
||||||
for url in current_level_urls:
|
for url in current_level_urls:
|
||||||
if url in processed_urls:
|
# Normalize URL for duplicate checking
|
||||||
logger.debug(f"URL {url} already processed, skipping")
|
normalized_url = self._normalizeUrl(url)
|
||||||
continue
|
if normalized_url in processed_urls:
|
||||||
|
logger.debug(f"URL {url} (normalized: {normalized_url}) already processed, skipping")
|
||||||
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
logger.info(f"Processing URL at depth {depth}: {url}")
|
logger.info(f"Processing URL at depth {depth}: {url}")
|
||||||
|
logger.debug(f"Total processed URLs so far: {len(processed_urls)}")
|
||||||
|
|
||||||
# Read page content
|
# Read page content
|
||||||
content = await self.readPage(url, extract_depth)
|
content = await self.readPage(url, extract_depth)
|
||||||
if content:
|
if content:
|
||||||
all_content[url] = content
|
all_content[url] = content
|
||||||
processed_urls.add(url)
|
processed_urls.add(normalized_url)
|
||||||
logger.info(f"✓ Successfully processed {url}: {len(content)} chars")
|
logger.info(f"✓ Successfully processed {url}: {len(content)} chars")
|
||||||
|
|
||||||
# Get URLs from this page for next level
|
# Get URLs from this page for next level
|
||||||
page_urls = await self.getUrlsFromPage(url, extract_depth)
|
page_urls = await self.getUrlsFromPage(url, extract_depth)
|
||||||
logger.info(f"Found {len(page_urls)} URLs on {url}")
|
logger.info(f"Found {len(page_urls)} URLs on {url}")
|
||||||
|
|
||||||
# Filter URLs and add to next level
|
# Filter URLs and add to next level
|
||||||
filtered_urls = self.filterUrlsOnlyPages(page_urls, max_per_domain)
|
filtered_urls = self.filterUrlsOnlyPages(page_urls, max_per_domain)
|
||||||
logger.info(f"Filtered to {len(filtered_urls)} valid URLs")
|
logger.info(f"Filtered to {len(filtered_urls)} valid URLs")
|
||||||
|
|
||||||
# Add new URLs to next level (avoiding already processed ones)
|
# Add new URLs to next level (avoiding already processed ones)
|
||||||
new_urls_count = 0
|
new_urls_count = 0
|
||||||
for new_url in filtered_urls:
|
for new_url in filtered_urls:
|
||||||
if new_url not in processed_urls:
|
normalized_new_url = self._normalizeUrl(new_url)
|
||||||
next_level_urls.append(new_url)
|
if normalized_new_url not in processed_urls:
|
||||||
new_urls_count += 1
|
next_level_urls.append(new_url)
|
||||||
|
new_urls_count += 1
|
||||||
|
else:
|
||||||
|
logger.debug(f"URL {new_url} (normalized: {normalized_new_url}) already processed, skipping")
|
||||||
|
|
||||||
logger.info(f"Added {new_urls_count} new URLs to next level from {url}")
|
logger.info(f"Added {new_urls_count} new URLs to next level from {url}")
|
||||||
else:
|
else:
|
||||||
logger.warning(f"✗ No content extracted from {url}")
|
logger.warning(f"✗ No content extracted from {url}")
|
||||||
processed_urls.add(url) # Mark as processed to avoid retry
|
processed_urls.add(normalized_url) # Mark as processed to avoid retry
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"✗ Failed to process URL {url} at depth {depth}: {e}")
|
logger.warning(f"✗ Failed to process URL {url} at depth {depth}: {e}")
|
||||||
processed_urls.add(url) # Mark as processed to avoid retry
|
processed_urls.add(normalized_url) # Mark as processed to avoid retry
|
||||||
|
|
||||||
# Prepare for next iteration
|
# Prepare for next iteration
|
||||||
current_level_urls = next_level_urls
|
current_level_urls = next_level_urls
|
||||||
logger.info(f"Depth {depth} completed. Found {len(next_level_urls)} URLs for next level")
|
logger.info(f"Depth {depth} completed. Found {len(next_level_urls)} URLs for next level")
|
||||||
|
|
||||||
# Stop if no more URLs to process
|
# Stop if no more URLs to process
|
||||||
if not current_level_urls:
|
if not current_level_urls:
|
||||||
logger.info(f"No more URLs found at depth {depth}, stopping recursion")
|
logger.info(f"No more URLs found at depth {depth}, stopping recursion")
|
||||||
break
|
break
|
||||||
|
|
||||||
logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
|
logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
|
||||||
return all_content
|
logger.info(f"Total URLs processed (including skipped): {len(processed_urls)}")
|
||||||
|
logger.info(f"Unique URLs found: {len(all_content)}")
|
||||||
|
return all_content
|
||||||
|
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
logger.warning(f"Crawling timed out, returning partial results: {len(all_content)} pages crawled so far")
|
||||||
|
return all_content
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Crawling failed with error: {e}, returning partial results: {len(all_content)} pages crawled so far")
|
||||||
|
return all_content
|
||||||
|
|
||||||
async def webQuery(self, query: str, context: str = "", options: AiCallOptions = None) -> str:
|
async def webQuery(self, query: str, context: str = "", options: AiCallOptions = None) -> str:
|
||||||
"""Use Perplexity AI to provide the best answers for web-related queries."""
|
"""Use Perplexity AI to provide the best answers for web-related queries."""
|
||||||
|
|
|
||||||
|
|
@ -1052,8 +1052,11 @@ class ChatObjects:
|
||||||
|
|
||||||
def _storeDebugMessageAndDocuments(self, message: ChatMessage) -> None:
|
def _storeDebugMessageAndDocuments(self, message: ChatMessage) -> None:
|
||||||
"""
|
"""
|
||||||
Store message and documents for debugging purposes in fileshare.
|
Store message and documents (metadata and file bytes) for debugging purposes.
|
||||||
Structure: gateway/test-chat/messages/m_round_task_action_timestamp/documentlist_label/documents
|
Structure: gateway/test-chat/messages/m_round_task_action_timestamp/documentlist_label/
|
||||||
|
- message.json, message_text.txt
|
||||||
|
- document_###_metadata.json
|
||||||
|
- document_###_<original_filename> (actual file bytes)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
message: ChatMessage object to store
|
message: ChatMessage object to store
|
||||||
|
|
@ -1157,6 +1160,26 @@ class ChatObjects:
|
||||||
|
|
||||||
logger.info(f"Debug: Stored document metadata for {doc.fileName}")
|
logger.info(f"Debug: Stored document metadata for {doc.fileName}")
|
||||||
|
|
||||||
|
# Also store the actual file bytes next to metadata for debugging
|
||||||
|
try:
|
||||||
|
# Lazy import to avoid circular deps at module load
|
||||||
|
from modules.interfaces import interfaceDbComponentObjects as comp
|
||||||
|
componentInterface = comp.getInterface(self.currentUser)
|
||||||
|
file_bytes = componentInterface.getFileData(doc.fileId)
|
||||||
|
if file_bytes:
|
||||||
|
# Build a safe filename preserving original name
|
||||||
|
safe_name = doc.fileName or f"document_{i+1:03d}"
|
||||||
|
# Avoid path traversal
|
||||||
|
safe_name = os.path.basename(safe_name)
|
||||||
|
doc_file_path = os.path.join(label_folder, f"document_{i+1:03d}_" + safe_name)
|
||||||
|
with open(doc_file_path, "wb") as df:
|
||||||
|
df.write(file_bytes)
|
||||||
|
logger.info(f"Debug: Stored document file bytes: {doc_file_path} ({len(file_bytes)} bytes)")
|
||||||
|
else:
|
||||||
|
logger.warning(f"Debug: No file bytes returned for fileId {doc.fileId}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Debug: Failed to store document file for {doc.fileName} (fileId {doc.fileId}): {e}")
|
||||||
|
|
||||||
logger.info(f"Debug: Stored message and documents in {message_path}")
|
logger.info(f"Debug: Stored message and documents in {message_path}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,7 @@ from modules.datamodels.datamodelWeb import (
|
||||||
WebSearchResultItem,
|
WebSearchResultItem,
|
||||||
)
|
)
|
||||||
from modules.interfaces.interfaceAiObjects import AiObjects
|
from modules.interfaces.interfaceAiObjects import AiObjects
|
||||||
|
from modules.shared.configuration import APP_CONFIG
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
@ -100,6 +101,9 @@ class AiService:
|
||||||
logger.info(f"User Query: {request.user_prompt}")
|
logger.info(f"User Query: {request.user_prompt}")
|
||||||
logger.info(f"Max Results: {request.max_results}, Max Pages: {request.options.max_pages}")
|
logger.info(f"Max Results: {request.max_results}, Max Pages: {request.options.max_pages}")
|
||||||
|
|
||||||
|
# Global URL index to track all processed URLs across the entire research session
|
||||||
|
global_processed_urls = set()
|
||||||
|
|
||||||
# Step 1: Find relevant websites - either provided URLs or AI-determined main URLs
|
# Step 1: Find relevant websites - either provided URLs or AI-determined main URLs
|
||||||
logger.info(f"=== STEP 1: INITIAL MAIN URLS LIST ===")
|
logger.info(f"=== STEP 1: INITIAL MAIN URLS LIST ===")
|
||||||
|
|
||||||
|
|
@ -129,7 +133,7 @@ class AiService:
|
||||||
Return ONLY this JSON format:
|
Return ONLY this JSON format:
|
||||||
{{
|
{{
|
||||||
"user_prompt": "search query based on user query above",
|
"user_prompt": "search query based on user query above",
|
||||||
"country": "country_code_or_null",
|
"country": "Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries)",
|
||||||
"language": "language_code_or_null",
|
"language": "language_code_or_null",
|
||||||
"topic": "general|news|academic_or_null",
|
"topic": "general|news|academic_or_null",
|
||||||
"time_range": "d|w|m|y_or_null",
|
"time_range": "d|w|m|y_or_null",
|
||||||
|
|
@ -194,10 +198,32 @@ class AiService:
|
||||||
}
|
}
|
||||||
|
|
||||||
# Add parameters only if they have valid values
|
# Add parameters only if they have valid values
|
||||||
if ai_country and ai_country not in ['null', '', 'none', 'undefined']:
|
def _normalizeCountry(c: Optional[str]) -> Optional[str]:
|
||||||
search_kwargs["country"] = ai_country
|
if not c:
|
||||||
elif request.options.country and request.options.country not in ['null', '', 'none', 'undefined']:
|
return None
|
||||||
search_kwargs["country"] = request.options.country
|
s = str(c).strip()
|
||||||
|
if not s or s.lower() in ['null', 'none', 'undefined']:
|
||||||
|
return None
|
||||||
|
# Map common codes to full English names when easy to do without extra deps
|
||||||
|
mapping = {
|
||||||
|
'ch': 'Switzerland', 'che': 'Switzerland',
|
||||||
|
'de': 'Germany', 'ger': 'Germany', 'deu': 'Germany',
|
||||||
|
'at': 'Austria', 'aut': 'Austria',
|
||||||
|
'us': 'United States', 'usa': 'United States', 'uni ted states': 'United States',
|
||||||
|
'uk': 'United Kingdom', 'gb': 'United Kingdom', 'gbr': 'United Kingdom'
|
||||||
|
}
|
||||||
|
key = s.lower()
|
||||||
|
if key in mapping:
|
||||||
|
return mapping[key]
|
||||||
|
# If looks like full name, capitalize first letter only (Tavily accepts English names)
|
||||||
|
return s
|
||||||
|
|
||||||
|
norm_ai_country = _normalizeCountry(ai_country)
|
||||||
|
norm_req_country = _normalizeCountry(request.options.country)
|
||||||
|
if norm_ai_country:
|
||||||
|
search_kwargs["country"] = norm_ai_country
|
||||||
|
elif norm_req_country:
|
||||||
|
search_kwargs["country"] = norm_req_country
|
||||||
|
|
||||||
if ai_language and ai_language not in ['null', '', 'none', 'undefined']:
|
if ai_language and ai_language not in ['null', '', 'none', 'undefined']:
|
||||||
search_kwargs["language"] = ai_language
|
search_kwargs["language"] = ai_language
|
||||||
|
|
@ -214,8 +240,35 @@ class AiService:
|
||||||
elif request.options.time_range and request.options.time_range in ['d', 'w', 'm', 'y']:
|
elif request.options.time_range and request.options.time_range in ['d', 'w', 'm', 'y']:
|
||||||
search_kwargs["time_range"] = request.options.time_range
|
search_kwargs["time_range"] = request.options.time_range
|
||||||
|
|
||||||
|
# Constrain by expected domains if provided by AI
|
||||||
|
try:
|
||||||
|
include_domains = []
|
||||||
|
for p in expected_patterns or []:
|
||||||
|
if not isinstance(p, str):
|
||||||
|
continue
|
||||||
|
# Extract bare domain from pattern or URL
|
||||||
|
import re
|
||||||
|
m = re.search(r"(?:https?://)?([^/\s]+)", p.strip())
|
||||||
|
if m:
|
||||||
|
domain = m.group(1).lower()
|
||||||
|
# strip leading www.
|
||||||
|
if domain.startswith('www.'):
|
||||||
|
domain = domain[4:]
|
||||||
|
include_domains.append(domain)
|
||||||
|
# Deduplicate
|
||||||
|
if include_domains:
|
||||||
|
seen = set()
|
||||||
|
uniq = []
|
||||||
|
for d in include_domains:
|
||||||
|
if d not in seen:
|
||||||
|
seen.add(d)
|
||||||
|
uniq.append(d)
|
||||||
|
search_kwargs["include_domains"] = uniq
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# Log the parameters being used
|
# Log the parameters being used
|
||||||
logger.info(f"Search parameters: country={search_kwargs.get('country', 'not_set')}, language={search_kwargs.get('language', 'not_set')}, topic={search_kwargs.get('topic', 'not_set')}, time_range={search_kwargs.get('time_range', 'not_set')}")
|
logger.info(f"Search parameters: country={search_kwargs.get('country', 'not_set')}, language={search_kwargs.get('language', 'not_set')}, topic={search_kwargs.get('topic', 'not_set')}, time_range={search_kwargs.get('time_range', 'not_set')}, include_domains={search_kwargs.get('include_domains', [])}")
|
||||||
|
|
||||||
search_results = await self.aiObjects.search_websites(**search_kwargs)
|
search_results = await self.aiObjects.search_websites(**search_kwargs)
|
||||||
|
|
||||||
|
|
@ -232,6 +285,8 @@ class AiService:
|
||||||
seen.add(u)
|
seen.add(u)
|
||||||
search_urls.append(u)
|
search_urls.append(u)
|
||||||
|
|
||||||
|
logger.info(f"After initial deduplication: {len(search_urls)} unique URLs from {len(search_results)} search results")
|
||||||
|
|
||||||
if not search_urls:
|
if not search_urls:
|
||||||
logger.error("No relevant websites found")
|
logger.error("No relevant websites found")
|
||||||
return WebResearchActionResult(success=False, error="No relevant websites found")
|
return WebResearchActionResult(success=False, error="No relevant websites found")
|
||||||
|
|
@ -281,6 +336,7 @@ class AiService:
|
||||||
unique_websites.append(url)
|
unique_websites.append(url)
|
||||||
|
|
||||||
websites = unique_websites
|
websites = unique_websites
|
||||||
|
logger.info(f"After AI selection deduplication: {len(websites)} unique URLs from {len(websites)} AI-selected URLs")
|
||||||
|
|
||||||
logger.info(f"AI selected {len(websites)} main URLs (after deduplication):")
|
logger.info(f"AI selected {len(websites)} main URLs (after deduplication):")
|
||||||
for i, url in enumerate(websites, 1):
|
for i, url in enumerate(websites, 1):
|
||||||
|
|
@ -305,18 +361,40 @@ class AiService:
|
||||||
logger.debug(f" {i}. {url}")
|
logger.debug(f" {i}. {url}")
|
||||||
|
|
||||||
# Step 3+4+5: Recursive crawling with configurable depth
|
# Step 3+4+5: Recursive crawling with configurable depth
|
||||||
logger.info(f"=== STEP 3+4+5: RECURSIVE CRAWLING (DEPTH {request.options.pages_search_depth}) ===")
|
# Get configuration parameters
|
||||||
|
max_depth = int(APP_CONFIG.get("Web_Research_MAX_DEPTH", "2"))
|
||||||
|
max_links_per_domain = int(APP_CONFIG.get("Web_Research_MAX_LINKS_PER_DOMAIN", "4"))
|
||||||
|
crawl_timeout_minutes = int(APP_CONFIG.get("Web_Research_CRAWL_TIMEOUT_MINUTES", "10"))
|
||||||
|
crawl_timeout_seconds = crawl_timeout_minutes * 60
|
||||||
|
|
||||||
|
# Use the configured max_depth or the request's pages_search_depth, whichever is smaller
|
||||||
|
effective_depth = min(max_depth, request.options.pages_search_depth)
|
||||||
|
|
||||||
|
logger.info(f"=== STEP 3+4+5: RECURSIVE CRAWLING (DEPTH {effective_depth}) ===")
|
||||||
logger.info(f"Starting recursive crawl of {len(selectedWebsites)} main websites...")
|
logger.info(f"Starting recursive crawl of {len(selectedWebsites)} main websites...")
|
||||||
logger.info(f"Search depth: {request.options.pages_search_depth} levels")
|
logger.info(f"Search depth: {effective_depth} levels (max configured: {max_depth})")
|
||||||
logger.info(f"DEBUG: request.options.pages_search_depth = {request.options.pages_search_depth}")
|
logger.info(f"Max links per domain: {max_links_per_domain}")
|
||||||
|
logger.info(f"Crawl timeout: {crawl_timeout_minutes} minutes")
|
||||||
|
|
||||||
# Use recursive crawling with URL index to avoid duplicates
|
# Use recursive crawling with URL index to avoid duplicates
|
||||||
allContent = await self.aiObjects.crawlRecursively(
|
import asyncio
|
||||||
urls=selectedWebsites,
|
try:
|
||||||
max_depth=request.options.pages_search_depth,
|
allContent = await asyncio.wait_for(
|
||||||
extract_depth=request.options.extract_depth,
|
self.aiObjects.crawlRecursively(
|
||||||
max_per_domain=10
|
urls=selectedWebsites,
|
||||||
)
|
max_depth=effective_depth,
|
||||||
|
extract_depth=request.options.extract_depth,
|
||||||
|
max_per_domain=max_links_per_domain,
|
||||||
|
global_processed_urls=global_processed_urls
|
||||||
|
),
|
||||||
|
timeout=crawl_timeout_seconds
|
||||||
|
)
|
||||||
|
logger.info(f"Crawling completed within timeout: {len(allContent)} pages crawled")
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
logger.warning(f"Crawling timed out after {crawl_timeout_minutes} minutes, using partial results")
|
||||||
|
# crawlRecursively now handles timeouts gracefully and returns partial results
|
||||||
|
# Try to get the partial results that were collected
|
||||||
|
allContent = {}
|
||||||
|
|
||||||
if not allContent:
|
if not allContent:
|
||||||
logger.error("Could not extract content from any websites")
|
logger.error("Could not extract content from any websites")
|
||||||
|
|
@ -324,7 +402,7 @@ class AiService:
|
||||||
|
|
||||||
logger.info(f"=== WEB RESEARCH COMPLETED ===")
|
logger.info(f"=== WEB RESEARCH COMPLETED ===")
|
||||||
logger.info(f"Successfully crawled {len(allContent)} URLs total")
|
logger.info(f"Successfully crawled {len(allContent)} URLs total")
|
||||||
logger.info(f"Crawl depth: {request.options.pages_search_depth} levels")
|
logger.info(f"Crawl depth: {effective_depth} levels")
|
||||||
|
|
||||||
# Create simple result with raw content
|
# Create simple result with raw content
|
||||||
sources = [WebSearchResultItem(title=url, url=url) for url in selectedWebsites]
|
sources = [WebSearchResultItem(title=url, url=url) for url in selectedWebsites]
|
||||||
|
|
@ -346,7 +424,10 @@ class AiService:
|
||||||
additional_links=additional_links,
|
additional_links=additional_links,
|
||||||
individual_content=allContent, # Individual URL -> content mapping
|
individual_content=allContent, # Individual URL -> content mapping
|
||||||
debug_info={
|
debug_info={
|
||||||
"crawl_depth": request.options.pages_search_depth,
|
"crawl_depth": effective_depth,
|
||||||
|
"max_configured_depth": max_depth,
|
||||||
|
"max_links_per_domain": max_links_per_domain,
|
||||||
|
"crawl_timeout_minutes": crawl_timeout_minutes,
|
||||||
"total_urls_crawled": len(allContent),
|
"total_urls_crawled": len(allContent),
|
||||||
"main_urls": len(selectedWebsites),
|
"main_urls": len(selectedWebsites),
|
||||||
"additional_urls": len(additional_links)
|
"additional_urls": len(additional_links)
|
||||||
|
|
@ -398,10 +479,18 @@ class AiService:
|
||||||
"mergeStrategy": {
|
"mergeStrategy": {
|
||||||
"groupBy": "typeGroup",
|
"groupBy": "typeGroup",
|
||||||
"orderBy": "id",
|
"orderBy": "id",
|
||||||
"mergeType": "concatenate"
|
"mergeType": "concatenate" # Default fallback
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Override mergeStrategy if provided in options
|
||||||
|
if options and hasattr(options, 'mergeStrategy') and options.mergeStrategy:
|
||||||
|
extractionOptions["mergeStrategy"] = options.mergeStrategy
|
||||||
|
else:
|
||||||
|
# Set intelligent merge strategy for JSON and CSV based on outputFormat
|
||||||
|
# This is a fallback when mergeStrategy is not provided in options
|
||||||
|
pass # Keep default concatenate strategy
|
||||||
|
|
||||||
processedContents: List[str] = []
|
processedContents: List[str] = []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
@ -641,12 +730,21 @@ class AiService:
|
||||||
# Merge AI results using ExtractionService
|
# Merge AI results using ExtractionService
|
||||||
from modules.datamodels.datamodelExtraction import MergeStrategy
|
from modules.datamodels.datamodelExtraction import MergeStrategy
|
||||||
|
|
||||||
mergeStrategy = MergeStrategy(
|
# Use mergeStrategy from options if available, otherwise default
|
||||||
groupBy="typeGroup",
|
if options and hasattr(options, 'mergeStrategy') and options.mergeStrategy:
|
||||||
orderBy="id",
|
mergeStrategy = MergeStrategy(
|
||||||
mergeType="concatenate",
|
groupBy=options.mergeStrategy.get("groupBy", "typeGroup"),
|
||||||
chunkSeparator="\n\n---\n\n"
|
orderBy=options.mergeStrategy.get("orderBy", "id"),
|
||||||
)
|
mergeType=options.mergeStrategy.get("mergeType", "concatenate"),
|
||||||
|
chunkSeparator="\n\n---\n\n"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
mergeStrategy = MergeStrategy(
|
||||||
|
groupBy="typeGroup",
|
||||||
|
orderBy="id",
|
||||||
|
mergeType="concatenate",
|
||||||
|
chunkSeparator="\n\n---\n\n"
|
||||||
|
)
|
||||||
|
|
||||||
mergedContent = self.extractionService.mergeAiResults(
|
mergedContent = self.extractionService.mergeAiResults(
|
||||||
extractionResult,
|
extractionResult,
|
||||||
|
|
|
||||||
|
|
@ -43,8 +43,9 @@ You are generating a document in {output_format.upper()} format for the title: "
|
||||||
|
|
||||||
Rules:
|
Rules:
|
||||||
- The user's intent fully defines the structure. Do not assume a fixed template or headings.
|
- The user's intent fully defines the structure. Do not assume a fixed template or headings.
|
||||||
- Use only factual information extracted from the supplied source documents.
|
- Work with whatever data is available from the source documents - partial data is better than no data.
|
||||||
- Do not invent, hallucinate, or include placeholders (e.g., "lorem ipsum", "TBD").
|
- If some information is missing, create the best possible document with what you have available.
|
||||||
|
- Do not refuse to generate the document due to incomplete data - always proceed with available information.
|
||||||
- The output must strictly follow the target format and be ready for saving without extra wrapping.
|
- The output must strictly follow the target format and be ready for saving without extra wrapping.
|
||||||
- At the VERY TOP output exactly one line with the filename header:
|
- At the VERY TOP output exactly one line with the filename header:
|
||||||
FILENAME: <safe-file-name-with-extension>
|
FILENAME: <safe-file-name-with-extension>
|
||||||
|
|
@ -55,7 +56,8 @@ Rules:
|
||||||
|
|
||||||
Common policy:
|
Common policy:
|
||||||
- Use the actual data from the source documents to create the content.
|
- Use the actual data from the source documents to create the content.
|
||||||
- Do not generate placeholder text or templates.
|
- If data is incomplete, work with what you have and create a meaningful document.
|
||||||
|
- Always generate the document - never refuse due to missing information.
|
||||||
- Extract and use the real data provided in the source documents to create meaningful content.
|
- Extract and use the real data provided in the source documents to create meaningful content.
|
||||||
""".strip()
|
""".strip()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -125,7 +125,9 @@ class WorkflowService:
|
||||||
break
|
break
|
||||||
|
|
||||||
if not message_found:
|
if not message_found:
|
||||||
logger.warning(f"Message with ID {message_id} not found in workflow. Available message IDs: {[str(msg.id) for msg in workflow.messages]}")
|
available_ids = [str(msg.id) for msg in workflow.messages]
|
||||||
|
logger.error(f"Message with ID {message_id} not found in workflow. Available message IDs: {available_ids}")
|
||||||
|
raise ValueError(f"Document reference not found: docList:{message_id}:{label}")
|
||||||
elif len(parts) >= 2:
|
elif len(parts) >= 2:
|
||||||
# Format: docList:<label> - find message by documentsLabel
|
# Format: docList:<label> - find message by documentsLabel
|
||||||
label = parts[1]
|
label = parts[1]
|
||||||
|
|
@ -154,7 +156,8 @@ class WorkflowService:
|
||||||
else:
|
else:
|
||||||
logger.debug(f"Found docList reference {doc_ref} but message has no documents")
|
logger.debug(f"Found docList reference {doc_ref} but message has no documents")
|
||||||
else:
|
else:
|
||||||
logger.debug(f"No messages found with documentsLabel: {label}")
|
logger.error(f"No messages found with documentsLabel: {label}")
|
||||||
|
raise ValueError(f"Document reference not found: docList:{label}")
|
||||||
else:
|
else:
|
||||||
# Direct label reference (round1_task2_action3_contextinfo)
|
# Direct label reference (round1_task2_action3_contextinfo)
|
||||||
# Search for messages with matching documentsLabel to find the actual documents
|
# Search for messages with matching documentsLabel to find the actual documents
|
||||||
|
|
@ -198,30 +201,8 @@ class WorkflowService:
|
||||||
else:
|
else:
|
||||||
logger.debug(f"No documents found in newest message {newest_message.id}")
|
logger.debug(f"No documents found in newest message {newest_message.id}")
|
||||||
else:
|
else:
|
||||||
logger.debug(f"No messages found with documentsLabel: {doc_ref}")
|
logger.error(f"No messages found with documentsLabel: {doc_ref}")
|
||||||
# Fallback: also check if any message has this documentsLabel as a prefix
|
raise ValueError(f"Document reference not found: {doc_ref}")
|
||||||
logger.debug(f"Trying fallback search for messages with documentsLabel containing: {doc_ref}")
|
|
||||||
fallback_messages = []
|
|
||||||
for message in workflow.messages:
|
|
||||||
msg_documents_label = getattr(message, 'documentsLabel', '')
|
|
||||||
if msg_documents_label and msg_documents_label.startswith(doc_ref):
|
|
||||||
fallback_messages.append(message)
|
|
||||||
logger.debug(f"Found fallback message {message.id} with documentsLabel: {msg_documents_label}")
|
|
||||||
|
|
||||||
if fallback_messages:
|
|
||||||
# Sort by publishedAt descending (newest first)
|
|
||||||
fallback_messages.sort(key=lambda msg: getattr(msg, 'publishedAt', 0), reverse=True)
|
|
||||||
newest_fallback = fallback_messages[0]
|
|
||||||
|
|
||||||
logger.debug(f"Using fallback message {newest_fallback.id} with documentsLabel: {getattr(newest_fallback, 'documentsLabel', 'unknown')}")
|
|
||||||
if newest_fallback.documents:
|
|
||||||
doc_names = [doc.fileName for doc in newest_fallback.documents if hasattr(doc, 'fileName')]
|
|
||||||
logger.debug(f"Added {len(newest_fallback.documents)} documents from fallback message {newest_fallback.id}: {doc_names}")
|
|
||||||
all_documents.extend(newest_fallback.documents)
|
|
||||||
else:
|
|
||||||
logger.debug(f"No documents found in fallback message {newest_fallback.id}")
|
|
||||||
else:
|
|
||||||
logger.debug(f"No fallback messages found either")
|
|
||||||
|
|
||||||
logger.debug(f"Resolved {len(all_documents)} documents from document list: {documentList}")
|
logger.debug(f"Resolved {len(all_documents)} documents from document list: {documentList}")
|
||||||
return all_documents
|
return all_documents
|
||||||
|
|
@ -622,6 +603,13 @@ class WorkflowService:
|
||||||
if not workflow or not hasattr(workflow, 'messages'):
|
if not workflow or not hasattr(workflow, 'messages'):
|
||||||
return "No documents available"
|
return "No documents available"
|
||||||
|
|
||||||
|
# Reload workflow from database to ensure we have all messages
|
||||||
|
if hasattr(workflow, 'id'):
|
||||||
|
try:
|
||||||
|
workflow = self.getWorkflow(workflow.id)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Could not reload workflow from database: {str(e)}")
|
||||||
|
|
||||||
# Get document reference list using the exact same logic as old system
|
# Get document reference list using the exact same logic as old system
|
||||||
document_list = self._getDocumentReferenceList(workflow)
|
document_list = self._getDocumentReferenceList(workflow)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -169,12 +169,12 @@ class MethodAi(MethodBase):
|
||||||
Parameters:
|
Parameters:
|
||||||
- user_prompt (str, required): Research question or topic.
|
- user_prompt (str, required): Research question or topic.
|
||||||
- urls (list, optional): Specific URLs to crawl.
|
- urls (list, optional): Specific URLs to crawl.
|
||||||
- max_results (int, optional): Max search results. Default: 10.
|
- max_results (int, optional): Max search results. Default: 5.
|
||||||
- max_pages (int, optional): Max pages to crawl per site. Default: 10.
|
- max_pages (int, optional): Max pages to crawl per site. Default: 5.
|
||||||
- search_depth (str, optional): basic | advanced. Default: basic.
|
- search_depth (str, optional): basic | advanced. Default: basic.
|
||||||
- extract_depth (str, optional): basic | advanced. Default: advanced.
|
- extract_depth (str, optional): basic | advanced. Default: advanced.
|
||||||
- pages_search_depth (int, optional): Crawl depth level. Default: 2.
|
- pages_search_depth (int, optional): Crawl depth level. Default: 2.
|
||||||
- country (str, optional): Country code for bias.
|
- country (str, optional): Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries).
|
||||||
- time_range (str, optional): d | w | m | y.
|
- time_range (str, optional): d | w | m | y.
|
||||||
- topic (str, optional): general | news | academic.
|
- topic (str, optional): general | news | academic.
|
||||||
- language (str, optional): Language code (e.g., de, en, fr).
|
- language (str, optional): Language code (e.g., de, en, fr).
|
||||||
|
|
@ -182,8 +182,8 @@ class MethodAi(MethodBase):
|
||||||
try:
|
try:
|
||||||
user_prompt = parameters.get("user_prompt")
|
user_prompt = parameters.get("user_prompt")
|
||||||
urls = parameters.get("urls")
|
urls = parameters.get("urls")
|
||||||
max_results = parameters.get("max_results", 10)
|
max_results = parameters.get("max_results", 5)
|
||||||
max_pages = parameters.get("max_pages", 10)
|
max_pages = parameters.get("max_pages", 5)
|
||||||
search_depth = parameters.get("search_depth", "basic")
|
search_depth = parameters.get("search_depth", "basic")
|
||||||
extract_depth = parameters.get("extract_depth", "advanced")
|
extract_depth = parameters.get("extract_depth", "advanced")
|
||||||
pages_search_depth = parameters.get("pages_search_depth", 2)
|
pages_search_depth = parameters.get("pages_search_depth", 2)
|
||||||
|
|
|
||||||
|
|
@ -42,9 +42,7 @@ class MethodDocument(MethodBase):
|
||||||
- operationType (str, optional): extract_content | analyze_document | summarize_content. Default: extract_content.
|
- operationType (str, optional): extract_content | analyze_document | summarize_content. Default: extract_content.
|
||||||
- processDocumentsIndividually (bool, optional): Process each document separately. Default: True.
|
- processDocumentsIndividually (bool, optional): Process each document separately. Default: True.
|
||||||
- chunkAllowed (bool, optional): Allow chunking for large inputs. Default: True.
|
- chunkAllowed (bool, optional): Allow chunking for large inputs. Default: True.
|
||||||
- mergeStrategy (dict, optional): Merge strategy for chunked content.
|
- outputMimeType (str, optional): MIME type for output file. Options: "text/plain" (default), "application/json", "text/csv", "text/html". Default: "text/plain".
|
||||||
- expectedDocumentFormats (list, optional): Desired output format specs.
|
|
||||||
- includeMetadata (bool, optional): Include file metadata. Default: True.
|
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
documentList = parameters.get("documentList")
|
documentList = parameters.get("documentList")
|
||||||
|
|
@ -54,13 +52,7 @@ class MethodDocument(MethodBase):
|
||||||
operationType = parameters.get("operationType", "extract_content")
|
operationType = parameters.get("operationType", "extract_content")
|
||||||
processDocumentsIndividually = parameters.get("processDocumentsIndividually", True)
|
processDocumentsIndividually = parameters.get("processDocumentsIndividually", True)
|
||||||
chunkAllowed = parameters.get("chunkAllowed", True)
|
chunkAllowed = parameters.get("chunkAllowed", True)
|
||||||
mergeStrategy = parameters.get("mergeStrategy", {
|
outputMimeType = parameters.get("outputMimeType", "text/plain")
|
||||||
"groupBy": "typeGroup",
|
|
||||||
"orderBy": "id",
|
|
||||||
"mergeType": "concatenate"
|
|
||||||
})
|
|
||||||
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
|
|
||||||
includeMetadata = parameters.get("includeMetadata", True)
|
|
||||||
|
|
||||||
if not documentList:
|
if not documentList:
|
||||||
return ActionResult.isFailure(
|
return ActionResult.isFailure(
|
||||||
|
|
@ -87,19 +79,16 @@ class MethodDocument(MethodBase):
|
||||||
compressContext=not chunkAllowed
|
compressContext=not chunkAllowed
|
||||||
)
|
)
|
||||||
|
|
||||||
# Add format instructions to prompt if expected formats are provided
|
# Add format instructions to prompt based on MIME type
|
||||||
enhanced_prompt = prompt
|
enhanced_prompt = prompt
|
||||||
if expectedDocumentFormats:
|
mime_type_mapping = {
|
||||||
format_instructions = []
|
"text/plain": (".txt", "Plain text format"),
|
||||||
for fmt in expectedDocumentFormats:
|
"application/json": (".json", "Structured JSON format"),
|
||||||
extension = fmt.get("extension", ".txt")
|
"text/csv": (".csv", "Table format"),
|
||||||
mime_type = fmt.get("mimeType", "text/plain")
|
"text/html": (".html", "HTML format")
|
||||||
description = fmt.get("description", "")
|
}
|
||||||
format_instructions.append(f"- {extension} ({mime_type}): {description}")
|
extension, description = mime_type_mapping.get(outputMimeType, (".txt", "Plain text format"))
|
||||||
|
enhanced_prompt += f"\n\nPlease format the output as {extension} ({outputMimeType}): {description}"
|
||||||
if format_instructions:
|
|
||||||
enhanced_prompt += f"\n\nPlease format the output as: {', '.join([fmt.get('extension', '.txt') for fmt in expectedDocumentFormats])}"
|
|
||||||
enhanced_prompt += f"\nExpected formats:\n" + "\n".join(format_instructions)
|
|
||||||
|
|
||||||
# Use enhanced AI service for extraction
|
# Use enhanced AI service for extraction
|
||||||
ai_response = await self.services.ai.callAi(
|
ai_response = await self.services.ai.callAi(
|
||||||
|
|
@ -125,8 +114,16 @@ class MethodDocument(MethodBase):
|
||||||
for i, chatDocument in enumerate(chatDocuments):
|
for i, chatDocument in enumerate(chatDocuments):
|
||||||
# Use the AI response directly - it already contains processed content
|
# Use the AI response directly - it already contains processed content
|
||||||
final_content = ai_response
|
final_content = ai_response
|
||||||
final_mime_type = "text/plain"
|
|
||||||
final_extension = ".txt"
|
# Determine output format based on MIME type
|
||||||
|
mime_type_mapping = {
|
||||||
|
"text/plain": ".txt",
|
||||||
|
"application/json": ".json",
|
||||||
|
"text/csv": ".csv",
|
||||||
|
"text/html": ".html"
|
||||||
|
}
|
||||||
|
final_extension = mime_type_mapping.get(outputMimeType, ".txt")
|
||||||
|
final_mime_type = outputMimeType
|
||||||
|
|
||||||
# Create meaningful output fileName with workflow context
|
# Create meaningful output fileName with workflow context
|
||||||
original_fileName = chatDocument.fileName
|
original_fileName = chatDocument.fileName
|
||||||
|
|
@ -175,8 +172,6 @@ class MethodDocument(MethodBase):
|
||||||
- operationType (str, optional): generate_report | analyze_documents. Default: generate_report.
|
- operationType (str, optional): generate_report | analyze_documents. Default: generate_report.
|
||||||
- processDocumentsIndividually (bool, optional): Process per document. Default: True.
|
- processDocumentsIndividually (bool, optional): Process per document. Default: True.
|
||||||
- chunkAllowed (bool, optional): Allow chunking for large inputs. Default: True.
|
- chunkAllowed (bool, optional): Allow chunking for large inputs. Default: True.
|
||||||
- mergeStrategy (dict, optional): Merging rules for multi-part generation.
|
|
||||||
- includeMetadata (bool, optional): Include file metadata. Default: True.
|
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
documentList = parameters.get("documentList")
|
documentList = parameters.get("documentList")
|
||||||
|
|
@ -188,12 +183,6 @@ class MethodDocument(MethodBase):
|
||||||
operationType = parameters.get("operationType", "generate_report")
|
operationType = parameters.get("operationType", "generate_report")
|
||||||
processDocumentsIndividually = parameters.get("processDocumentsIndividually", True)
|
processDocumentsIndividually = parameters.get("processDocumentsIndividually", True)
|
||||||
chunkAllowed = parameters.get("chunkAllowed", True)
|
chunkAllowed = parameters.get("chunkAllowed", True)
|
||||||
mergeStrategy = parameters.get("mergeStrategy", {
|
|
||||||
"groupBy": "typeGroup",
|
|
||||||
"orderBy": "id",
|
|
||||||
"mergeType": "concatenate"
|
|
||||||
})
|
|
||||||
includeMetadata = parameters.get("includeMetadata", True)
|
|
||||||
|
|
||||||
if not documentList:
|
if not documentList:
|
||||||
return ActionResult.isFailure(
|
return ActionResult.isFailure(
|
||||||
|
|
|
||||||
|
|
@ -154,7 +154,13 @@ class MethodOutlook(MethodBase):
|
||||||
if not query or not query.strip():
|
if not query or not query.strip():
|
||||||
# No query specified, just get emails from folder
|
# No query specified, just get emails from folder
|
||||||
if folder and folder.lower() != "all":
|
if folder and folder.lower() != "all":
|
||||||
params["$filter"] = f"parentFolderId eq '{folder}'"
|
# Use folder name directly for well-known folders, or get folder ID
|
||||||
|
if folder.lower() in ["inbox", "drafts", "sentitems", "deleteditems"]:
|
||||||
|
params["$filter"] = f"parentFolderId eq '{folder}'"
|
||||||
|
else:
|
||||||
|
# For custom folders, we need to get the folder ID first
|
||||||
|
# This will be handled by the calling method
|
||||||
|
params["$filter"] = f"parentFolderId eq '{folder}'"
|
||||||
# Add orderby for basic queries
|
# Add orderby for basic queries
|
||||||
params["$orderby"] = "receivedDateTime desc"
|
params["$orderby"] = "receivedDateTime desc"
|
||||||
return params
|
return params
|
||||||
|
|
@ -191,11 +197,21 @@ class MethodOutlook(MethodBase):
|
||||||
|
|
||||||
|
|
||||||
# Use only subject search to keep filter simple
|
# Use only subject search to keep filter simple
|
||||||
params["$filter"] = f"contains(subject,'{clean_query}')"
|
# Handle wildcard queries specially
|
||||||
|
if clean_query == "*" or clean_query == "":
|
||||||
|
# For wildcard or empty query, don't use contains filter
|
||||||
|
# Just use folder filter if specified
|
||||||
|
if folder and folder.lower() != "all":
|
||||||
|
params["$filter"] = f"parentFolderId eq '{folder}'"
|
||||||
|
else:
|
||||||
|
# No filter needed for wildcard search across all folders
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
params["$filter"] = f"contains(subject,'{clean_query}')"
|
||||||
|
|
||||||
# Add folder filter if specified
|
# Add folder filter if specified
|
||||||
if folder and folder.lower() != "all":
|
if folder and folder.lower() != "all":
|
||||||
params["$filter"] = f"{params['$filter']} and parentFolderId eq '{folder}'"
|
params["$filter"] = f"{params['$filter']} and parentFolderId eq '{folder}'"
|
||||||
|
|
||||||
# Add orderby for basic queries
|
# Add orderby for basic queries
|
||||||
params["$orderby"] = "receivedDateTime desc"
|
params["$orderby"] = "receivedDateTime desc"
|
||||||
|
|
@ -300,26 +316,31 @@ class MethodOutlook(MethodBase):
|
||||||
"""
|
"""
|
||||||
GENERAL:
|
GENERAL:
|
||||||
- Purpose: Read emails and metadata from a mailbox folder.
|
- Purpose: Read emails and metadata from a mailbox folder.
|
||||||
- Input requirements: connectionReference (required); optional folder, limit, filter, expectedDocumentFormats.
|
- Input requirements: connectionReference (required); optional folder, limit, filter, outputMimeType.
|
||||||
- Output format: JSON with emails and metadata.
|
- Output format: JSON with emails and metadata.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
- connectionReference (str, required): Microsoft connection label.
|
- connectionReference (str, required): Microsoft connection label.
|
||||||
- folder (str, optional): Folder to read from. Default: Inbox.
|
- folder (str, optional): Folder to read from. Default: Inbox.
|
||||||
- limit (int, optional): Maximum items to return. Default: 10.
|
- limit (int, optional): Maximum items to return. Must be > 0. Default: 1000.
|
||||||
- filter (str, optional): Sender, query operators, or subject text.
|
- filter (str, optional): Sender, query operators, or subject text.
|
||||||
- expectedDocumentFormats (list, optional): Output format preferences.
|
- outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
connectionReference = parameters.get("connectionReference")
|
connectionReference = parameters.get("connectionReference")
|
||||||
folder = parameters.get("folder", "Inbox")
|
folder = parameters.get("folder", "Inbox")
|
||||||
limit = parameters.get("limit", 10)
|
limit = parameters.get("limit", 10)
|
||||||
filter = parameters.get("filter")
|
filter = parameters.get("filter")
|
||||||
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
|
outputMimeType = parameters.get("outputMimeType", "application/json")
|
||||||
|
|
||||||
if not connectionReference:
|
if not connectionReference:
|
||||||
return ActionResult.isFailure(error="Connection reference is required")
|
return ActionResult.isFailure(error="Connection reference is required")
|
||||||
|
|
||||||
|
# Validate limit parameter
|
||||||
|
if limit <= 0:
|
||||||
|
limit = 1000
|
||||||
|
logger.warning(f"Invalid limit value ({limit}), using default value 1000")
|
||||||
|
|
||||||
# Validate filter parameter if provided
|
# Validate filter parameter if provided
|
||||||
if filter:
|
if filter:
|
||||||
# Remove any potentially dangerous characters that could break the filter
|
# Remove any potentially dangerous characters that could break the filter
|
||||||
|
|
@ -343,8 +364,16 @@ class MethodOutlook(MethodBase):
|
||||||
"Content-Type": "application/json"
|
"Content-Type": "application/json"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Build the API request
|
# Get the folder ID for the specified folder
|
||||||
api_url = f"{graph_url}/me/mailFolders/{folder}/messages"
|
folder_id = self._getFolderId(folder, connection)
|
||||||
|
|
||||||
|
if folder_id:
|
||||||
|
# Build the API request with folder ID
|
||||||
|
api_url = f"{graph_url}/me/mailFolders/{folder_id}/messages"
|
||||||
|
else:
|
||||||
|
# Fallback: use folder name directly (for well-known folders like "Inbox")
|
||||||
|
api_url = f"{graph_url}/me/mailFolders/{folder}/messages"
|
||||||
|
logger.warning(f"Could not find folder ID for '{folder}', using folder name directly")
|
||||||
params = {
|
params = {
|
||||||
"$top": limit,
|
"$top": limit,
|
||||||
"$orderby": "receivedDateTime desc"
|
"$orderby": "receivedDateTime desc"
|
||||||
|
|
@ -380,7 +409,11 @@ class MethodOutlook(MethodBase):
|
||||||
"count": len(emails_data.get("value", [])),
|
"count": len(emails_data.get("value", [])),
|
||||||
"folder": folder,
|
"folder": folder,
|
||||||
"filter": filter,
|
"filter": filter,
|
||||||
"apiResponse": emails_data
|
"apiMetadata": {
|
||||||
|
"@odata.context": emails_data.get("@odata.context"),
|
||||||
|
"@odata.count": emails_data.get("@odata.count"),
|
||||||
|
"@odata.nextLink": emails_data.get("@odata.nextLink")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -405,18 +438,15 @@ class MethodOutlook(MethodBase):
|
||||||
logger.error(f"Error reading emails from Microsoft Graph API: {str(e)}")
|
logger.error(f"Error reading emails from Microsoft Graph API: {str(e)}")
|
||||||
return ActionResult.isFailure(error=f"Failed to read emails: {str(e)}")
|
return ActionResult.isFailure(error=f"Failed to read emails: {str(e)}")
|
||||||
|
|
||||||
# Determine output format based on expected formats
|
# Determine output format based on MIME type
|
||||||
output_extension = ".json" # Default
|
mime_type_mapping = {
|
||||||
output_mime_type = "application/json" # Default
|
"application/json": ".json",
|
||||||
|
"text/plain": ".txt",
|
||||||
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
|
"text/csv": ".csv"
|
||||||
# Use the first expected format
|
}
|
||||||
expected_format = expectedDocumentFormats[0]
|
output_extension = mime_type_mapping.get(outputMimeType, ".json")
|
||||||
output_extension = expected_format.get("extension", ".json")
|
output_mime_type = outputMimeType
|
||||||
output_mime_type = expected_format.get("mimeType", "application/json")
|
logger.info(f"Using output format: {output_extension} ({output_mime_type})")
|
||||||
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
|
|
||||||
else:
|
|
||||||
logger.info("No expected format specified, using default .json format")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -454,27 +484,32 @@ class MethodOutlook(MethodBase):
|
||||||
"""
|
"""
|
||||||
GENERAL:
|
GENERAL:
|
||||||
- Purpose: Search emails by query and return matching items with metadata.
|
- Purpose: Search emails by query and return matching items with metadata.
|
||||||
- Input requirements: connectionReference (required); query (required); optional folder, limit, expectedDocumentFormats.
|
- Input requirements: connectionReference (required); query (required); optional folder, limit, outputMimeType.
|
||||||
- Output format: JSON with search results and metadata.
|
- Output format: JSON with search results and metadata.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
- connectionReference (str, required): Microsoft connection label.
|
- connectionReference (str, required): Microsoft connection label.
|
||||||
- query (str, required): Search expression.
|
- query (str, required): Search expression.
|
||||||
- folder (str, optional): Folder scope or All. Default: All.
|
- folder (str, optional): Folder scope or All. Default: All.
|
||||||
- limit (int, optional): Maximum items to return. Default: 20.
|
- limit (int, optional): Maximum items to return. Must be > 0. Default: 1000.
|
||||||
- expectedDocumentFormats (list, optional): Output format preferences.
|
- outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
connectionReference = parameters.get("connectionReference")
|
connectionReference = parameters.get("connectionReference")
|
||||||
query = parameters.get("query")
|
query = parameters.get("query")
|
||||||
folder = parameters.get("folder", "All")
|
folder = parameters.get("folder", "All")
|
||||||
limit = parameters.get("limit", 20)
|
limit = parameters.get("limit", 1000)
|
||||||
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
|
outputMimeType = parameters.get("outputMimeType", "application/json")
|
||||||
|
|
||||||
# Validate parameters
|
# Validate parameters
|
||||||
if not connectionReference:
|
if not connectionReference:
|
||||||
return ActionResult.isFailure(error="Connection reference is required")
|
return ActionResult.isFailure(error="Connection reference is required")
|
||||||
|
|
||||||
|
# Validate limit parameter
|
||||||
|
if limit <= 0:
|
||||||
|
limit = 1000
|
||||||
|
logger.warning(f"Invalid limit value ({limit}), using default value 1000")
|
||||||
|
|
||||||
if not query or not query.strip():
|
if not query or not query.strip():
|
||||||
return ActionResult.isFailure(error="Search query is required and cannot be empty")
|
return ActionResult.isFailure(error="Search query is required and cannot be empty")
|
||||||
|
|
||||||
|
|
@ -488,12 +523,15 @@ class MethodOutlook(MethodBase):
|
||||||
# Validate limit
|
# Validate limit
|
||||||
try:
|
try:
|
||||||
limit = int(limit)
|
limit = int(limit)
|
||||||
if limit <= 0 or limit > 1000: # Microsoft Graph API has limits
|
if limit <= 0:
|
||||||
limit = 20
|
limit = 1000
|
||||||
logger.warning(f"Limit {limit} is out of range, using default value 20")
|
logger.warning(f"Invalid limit value (<=0), using default value 1000")
|
||||||
|
elif limit > 1000: # Microsoft Graph API has limits
|
||||||
|
limit = 1000
|
||||||
|
logger.warning(f"Limit {limit} exceeds maximum (1000), using 1000")
|
||||||
except (ValueError, TypeError):
|
except (ValueError, TypeError):
|
||||||
limit = 20
|
limit = 1000
|
||||||
logger.warning(f"Invalid limit value, using default value 20")
|
logger.warning(f"Invalid limit value, using default value 1000")
|
||||||
|
|
||||||
# Get Microsoft connection
|
# Get Microsoft connection
|
||||||
connection = self._getMicrosoftConnection(connectionReference)
|
connection = self._getMicrosoftConnection(connectionReference)
|
||||||
|
|
@ -509,9 +547,18 @@ class MethodOutlook(MethodBase):
|
||||||
"Content-Type": "application/json"
|
"Content-Type": "application/json"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Get the folder ID for the specified folder if needed
|
||||||
|
folder_id = None
|
||||||
|
if folder and folder.lower() != "all":
|
||||||
|
folder_id = self._getFolderId(folder, connection)
|
||||||
|
if folder_id:
|
||||||
|
logger.debug(f"Found folder ID for '{folder}': {folder_id}")
|
||||||
|
else:
|
||||||
|
logger.warning(f"Could not find folder ID for '{folder}', using folder name directly")
|
||||||
|
|
||||||
# Build the search API request
|
# Build the search API request
|
||||||
api_url = f"{graph_url}/me/messages"
|
api_url = f"{graph_url}/me/messages"
|
||||||
params = self._buildSearchParameters(query, folder, limit)
|
params = self._buildSearchParameters(query, folder_id or folder, limit)
|
||||||
|
|
||||||
# Log search parameters for debugging
|
# Log search parameters for debugging
|
||||||
logger.debug(f"Search query: '{query}'")
|
logger.debug(f"Search query: '{query}'")
|
||||||
|
|
@ -605,7 +652,11 @@ class MethodOutlook(MethodBase):
|
||||||
"count": len(emails),
|
"count": len(emails),
|
||||||
"folder": folder,
|
"folder": folder,
|
||||||
"limit": limit,
|
"limit": limit,
|
||||||
"apiResponse": search_data,
|
"apiMetadata": {
|
||||||
|
"@odata.context": search_data.get("@odata.context"),
|
||||||
|
"@odata.count": search_data.get("@odata.count"),
|
||||||
|
"@odata.nextLink": search_data.get("@odata.nextLink")
|
||||||
|
},
|
||||||
"searchParams": params
|
"searchParams": params
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -618,18 +669,15 @@ class MethodOutlook(MethodBase):
|
||||||
logger.error(f"Error searching emails via Microsoft Graph API: {str(e)}")
|
logger.error(f"Error searching emails via Microsoft Graph API: {str(e)}")
|
||||||
return ActionResult.isFailure(error=f"Failed to search emails: {str(e)}")
|
return ActionResult.isFailure(error=f"Failed to search emails: {str(e)}")
|
||||||
|
|
||||||
# Determine output format based on expected formats
|
# Determine output format based on MIME type
|
||||||
output_extension = ".json" # Default
|
mime_type_mapping = {
|
||||||
output_mime_type = "application/json" # Default
|
"application/json": ".json",
|
||||||
|
"text/plain": ".txt",
|
||||||
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
|
"text/csv": ".csv"
|
||||||
# Use the first expected format
|
}
|
||||||
expected_format = expectedDocumentFormats[0]
|
output_extension = mime_type_mapping.get(outputMimeType, ".json")
|
||||||
output_extension = expected_format.get("extension", ".json")
|
output_mime_type = outputMimeType
|
||||||
output_mime_type = expected_format.get("mimeType", "application/json")
|
logger.info(f"Using output format: {output_extension} ({output_mime_type})")
|
||||||
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
|
|
||||||
else:
|
|
||||||
logger.info("No expected format specified, using default .json format")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -664,20 +712,20 @@ class MethodOutlook(MethodBase):
|
||||||
"""
|
"""
|
||||||
GENERAL:
|
GENERAL:
|
||||||
- Purpose: List draft emails from a folder.
|
- Purpose: List draft emails from a folder.
|
||||||
- Input requirements: connectionReference (required); optional folder, limit, expectedDocumentFormats.
|
- Input requirements: connectionReference (required); optional folder, limit, outputMimeType.
|
||||||
- Output format: JSON with draft items and metadata.
|
- Output format: JSON with draft items and metadata.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
- connectionReference (str, required): Microsoft connection label.
|
- connectionReference (str, required): Microsoft connection label.
|
||||||
- folder (str, optional): Drafts folder to list. Default: Drafts.
|
- folder (str, optional): Drafts folder to list. Default: Drafts.
|
||||||
- limit (int, optional): Maximum items to return. Default: 20.
|
- limit (int, optional): Maximum items to return. Must be > 0. Default: 1000.
|
||||||
- expectedDocumentFormats (list, optional): Output format preferences.
|
- outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
connectionReference = parameters.get("connectionReference")
|
connectionReference = parameters.get("connectionReference")
|
||||||
folder = parameters.get("folder", "Drafts")
|
folder = parameters.get("folder", "Drafts")
|
||||||
limit = parameters.get("limit", 20)
|
limit = parameters.get("limit", 1000)
|
||||||
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
|
outputMimeType = parameters.get("outputMimeType", "application/json")
|
||||||
|
|
||||||
if not connectionReference:
|
if not connectionReference:
|
||||||
return ActionResult.isFailure(error="Connection reference is required")
|
return ActionResult.isFailure(error="Connection reference is required")
|
||||||
|
|
@ -745,18 +793,15 @@ class MethodOutlook(MethodBase):
|
||||||
logger.error(f"Error listing drafts via Microsoft Graph API: {str(e)}")
|
logger.error(f"Error listing drafts via Microsoft Graph API: {str(e)}")
|
||||||
return ActionResult.isFailure(error=f"Failed to list drafts: {str(e)}")
|
return ActionResult.isFailure(error=f"Failed to list drafts: {str(e)}")
|
||||||
|
|
||||||
# Determine output format based on expected formats
|
# Determine output format based on MIME type
|
||||||
output_extension = ".json" # Default
|
mime_type_mapping = {
|
||||||
output_mime_type = "application/json" # Default
|
"application/json": ".json",
|
||||||
|
"text/plain": ".txt",
|
||||||
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
|
"text/csv": ".csv"
|
||||||
# Use the first expected format
|
}
|
||||||
expected_format = expectedDocumentFormats[0]
|
output_extension = mime_type_mapping.get(outputMimeType, ".json")
|
||||||
output_extension = expected_format.get("extension", ".json")
|
output_mime_type = outputMimeType
|
||||||
output_mime_type = expected_format.get("mimeType", "application/json")
|
logger.info(f"Using output format: {output_extension} ({output_mime_type})")
|
||||||
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
|
|
||||||
else:
|
|
||||||
logger.info("No expected format specified, using default .json format")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -790,18 +835,18 @@ class MethodOutlook(MethodBase):
|
||||||
"""
|
"""
|
||||||
GENERAL:
|
GENERAL:
|
||||||
- Purpose: Find draft emails across folders.
|
- Purpose: Find draft emails across folders.
|
||||||
- Input requirements: connectionReference (required); optional limit, expectedDocumentFormats.
|
- Input requirements: connectionReference (required); optional limit, outputMimeType.
|
||||||
- Output format: JSON with drafts and metadata.
|
- Output format: JSON with drafts and metadata.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
- connectionReference (str, required): Microsoft connection label.
|
- connectionReference (str, required): Microsoft connection label.
|
||||||
- limit (int, optional): Maximum items to return. Default: 50.
|
- limit (int, optional): Maximum items to return. Default: 50.
|
||||||
- expectedDocumentFormats (list, optional): Output format preferences.
|
- outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
connectionReference = parameters.get("connectionReference")
|
connectionReference = parameters.get("connectionReference")
|
||||||
limit = parameters.get("limit", 50)
|
limit = parameters.get("limit", 50)
|
||||||
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
|
outputMimeType = parameters.get("outputMimeType", "application/json")
|
||||||
|
|
||||||
if not connectionReference:
|
if not connectionReference:
|
||||||
return ActionResult.isFailure(error="Connection reference is required")
|
return ActionResult.isFailure(error="Connection reference is required")
|
||||||
|
|
@ -859,18 +904,15 @@ class MethodOutlook(MethodBase):
|
||||||
logger.error(f"Error finding drafts via Microsoft Graph API: {str(e)}")
|
logger.error(f"Error finding drafts via Microsoft Graph API: {str(e)}")
|
||||||
return ActionResult.isFailure(error=f"Failed to find drafts: {str(e)}")
|
return ActionResult.isFailure(error=f"Failed to find drafts: {str(e)}")
|
||||||
|
|
||||||
# Determine output format based on expected formats
|
# Determine output format based on MIME type
|
||||||
output_extension = ".json" # Default
|
mime_type_mapping = {
|
||||||
output_mime_type = "application/json" # Default
|
"application/json": ".json",
|
||||||
|
"text/plain": ".txt",
|
||||||
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
|
"text/csv": ".csv"
|
||||||
# Use the first expected format
|
}
|
||||||
expected_format = expectedDocumentFormats[0]
|
output_extension = mime_type_mapping.get(outputMimeType, ".json")
|
||||||
output_extension = expected_format.get("extension", ".json")
|
output_mime_type = outputMimeType
|
||||||
output_mime_type = expected_format.get("mimeType", "application/json")
|
logger.info(f"Using output format: {output_extension} ({output_mime_type})")
|
||||||
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
|
|
||||||
else:
|
|
||||||
logger.info("No expected format specified, using default .json format")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -930,18 +972,18 @@ class MethodOutlook(MethodBase):
|
||||||
"""
|
"""
|
||||||
GENERAL:
|
GENERAL:
|
||||||
- Purpose: Check contents of the Drafts folder.
|
- Purpose: Check contents of the Drafts folder.
|
||||||
- Input requirements: connectionReference (required); optional limit, expectedDocumentFormats.
|
- Input requirements: connectionReference (required); optional limit, outputMimeType.
|
||||||
- Output format: JSON with drafts and metadata.
|
- Output format: JSON with drafts and metadata.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
- connectionReference (str, required): Microsoft connection label.
|
- connectionReference (str, required): Microsoft connection label.
|
||||||
- limit (int, optional): Maximum items to return. Default: 20.
|
- limit (int, optional): Maximum items to return. Default: 20.
|
||||||
- expectedDocumentFormats (list, optional): Output format preferences.
|
- outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
connectionReference = parameters.get("connectionReference")
|
connectionReference = parameters.get("connectionReference")
|
||||||
limit = parameters.get("limit", 20)
|
limit = parameters.get("limit", 20)
|
||||||
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
|
outputMimeType = parameters.get("outputMimeType", "application/json")
|
||||||
|
|
||||||
if not connectionReference:
|
if not connectionReference:
|
||||||
return ActionResult.isFailure(error="Connection reference is required")
|
return ActionResult.isFailure(error="Connection reference is required")
|
||||||
|
|
@ -1003,18 +1045,15 @@ class MethodOutlook(MethodBase):
|
||||||
logger.error(f"Error checking Drafts folder via Microsoft Graph API: {str(e)}")
|
logger.error(f"Error checking Drafts folder via Microsoft Graph API: {str(e)}")
|
||||||
return ActionResult.isFailure(error=f"Failed to check Drafts folder: {str(e)}")
|
return ActionResult.isFailure(error=f"Failed to check Drafts folder: {str(e)}")
|
||||||
|
|
||||||
# Determine output format based on expected formats
|
# Determine output format based on MIME type
|
||||||
output_extension = ".json" # Default
|
mime_type_mapping = {
|
||||||
output_mime_type = "application/json" # Default
|
"application/json": ".json",
|
||||||
|
"text/plain": ".txt",
|
||||||
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
|
"text/csv": ".csv"
|
||||||
# Use the first expected format
|
}
|
||||||
expected_format = expectedDocumentFormats[0]
|
output_extension = mime_type_mapping.get(outputMimeType, ".json")
|
||||||
output_extension = expected_format.get("extension", ".json")
|
output_mime_type = outputMimeType
|
||||||
output_mime_type = expected_format.get("mimeType", "application/json")
|
logger.info(f"Using output format: {output_extension} ({output_mime_type})")
|
||||||
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
|
|
||||||
else:
|
|
||||||
logger.info("No expected format specified, using default .json format")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -49,13 +49,14 @@ class ReactMode(BaseMode):
|
||||||
"""Execute task using React mode - iterative plan-act-observe-refine loop"""
|
"""Execute task using React mode - iterative plan-act-observe-refine loop"""
|
||||||
logger.info(f"=== STARTING TASK {taskIndex or '?'}: {taskStep.objective} ===")
|
logger.info(f"=== STARTING TASK {taskIndex or '?'}: {taskStep.objective} ===")
|
||||||
|
|
||||||
# NEW: Analyze user intent with both original prompt and task objective
|
# NEW: Analyze intents separately for proper validation vs task completion
|
||||||
# Get original user prompt from services (clean and reliable)
|
# Workflow-level intent from cleaned original user prompt
|
||||||
original_prompt = self.services.currentUserPrompt if self.services and hasattr(self.services, 'currentUserPrompt') else taskStep.objective
|
original_prompt = self.services.currentUserPrompt if self.services and hasattr(self.services, 'currentUserPrompt') else taskStep.objective
|
||||||
combined_context = f"Original request: {original_prompt}\n\nCurrent task: {taskStep.objective}"
|
self.workflowIntent = self.intentAnalyzer.analyzeUserIntent(original_prompt, context)
|
||||||
|
# Task-level intent from current task objective (used only for task-scoped checks)
|
||||||
self.currentIntent = self.intentAnalyzer.analyzeUserIntent(combined_context, context)
|
self.taskIntent = self.intentAnalyzer.analyzeUserIntent(taskStep.objective, context)
|
||||||
logger.info(f"Intent analysis (original + task): {self.currentIntent}")
|
logger.info(f"Intent analysis — workflow: {self.workflowIntent}")
|
||||||
|
logger.info(f"Intent analysis — task: {self.taskIntent}")
|
||||||
|
|
||||||
# NEW: Reset progress tracking for new task
|
# NEW: Reset progress tracking for new task
|
||||||
self.progressTracker.reset()
|
self.progressTracker.reset()
|
||||||
|
|
@ -99,18 +100,18 @@ class ReactMode(BaseMode):
|
||||||
# Attach deterministic label for clarity
|
# Attach deterministic label for clarity
|
||||||
observation['resultLabel'] = result.resultLabel
|
observation['resultLabel'] = result.resultLabel
|
||||||
|
|
||||||
# NEW: Add content validation
|
# NEW: Add content validation (against original cleaned user prompt / workflow intent)
|
||||||
if self.currentIntent and result.documents:
|
if getattr(self, 'workflowIntent', None) and result.documents:
|
||||||
validationResult = self.contentValidator.validateContent(result.documents, self.currentIntent)
|
validationResult = self.contentValidator.validateContent(result.documents, self.workflowIntent)
|
||||||
observation['contentValidation'] = validationResult
|
observation['contentValidation'] = validationResult
|
||||||
logger.info(f"Content validation: {validationResult['overallSuccess']} (quality: {validationResult['qualityScore']:.2f})")
|
logger.info(f"Content validation: {validationResult['overallSuccess']} (quality: {validationResult['qualityScore']:.2f})")
|
||||||
|
|
||||||
# NEW: Learn from feedback
|
# NEW: Learn from feedback
|
||||||
feedback = self._collectFeedback(result, validationResult, self.currentIntent)
|
feedback = self._collectFeedback(result, validationResult, self.workflowIntent)
|
||||||
self.learningEngine.learnFromFeedback(feedback, context, self.currentIntent)
|
self.learningEngine.learnFromFeedback(feedback, context, self.workflowIntent)
|
||||||
|
|
||||||
# NEW: Update progress
|
# NEW: Update progress
|
||||||
self.progressTracker.updateProgress(result, validationResult, self.currentIntent)
|
self.progressTracker.updateProgress(result, validationResult, self.workflowIntent)
|
||||||
|
|
||||||
decision = await self._refineDecide(context, observation)
|
decision = await self._refineDecide(context, observation)
|
||||||
|
|
||||||
|
|
@ -204,6 +205,11 @@ class ReactMode(BaseMode):
|
||||||
selection = json.loads(response[jsonStart:jsonEnd])
|
selection = json.loads(response[jsonStart:jsonEnd])
|
||||||
if 'action' not in selection or not isinstance(selection['action'], str):
|
if 'action' not in selection or not isinstance(selection['action'], str):
|
||||||
raise ValueError("Selection missing 'action' as string")
|
raise ValueError("Selection missing 'action' as string")
|
||||||
|
|
||||||
|
# Validate document references - prevent AI from inventing Message IDs
|
||||||
|
if 'requiredInputDocuments' in selection:
|
||||||
|
self._validateDocumentReferences(selection['requiredInputDocuments'], context)
|
||||||
|
|
||||||
# Enforce spec: Stage 1 must NOT include 'parameters'
|
# Enforce spec: Stage 1 must NOT include 'parameters'
|
||||||
if 'parameters' in selection:
|
if 'parameters' in selection:
|
||||||
# Remove to avoid accidental carryover
|
# Remove to avoid accidental carryover
|
||||||
|
|
@ -213,6 +219,38 @@ class ReactMode(BaseMode):
|
||||||
selection['parameters'] = None
|
selection['parameters'] = None
|
||||||
return selection
|
return selection
|
||||||
|
|
||||||
|
def _validateDocumentReferences(self, document_refs: List[str], context: TaskContext) -> None:
|
||||||
|
"""Validate that document references exist in the current workflow"""
|
||||||
|
if not document_refs:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Get available documents from the current workflow
|
||||||
|
try:
|
||||||
|
available_docs = self.services.workflow.getAvailableDocuments(context.workflow)
|
||||||
|
if not available_docs or available_docs == "No documents available":
|
||||||
|
logger.warning("No documents available for validation")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Extract all valid references from available documents
|
||||||
|
valid_refs = []
|
||||||
|
for line in available_docs.split('\n'):
|
||||||
|
if 'docList:' in line or 'docItem:' in line:
|
||||||
|
# Extract reference from line like " - docList:msg_xxx:label"
|
||||||
|
ref_match = re.search(r'(docList:[^\s]+|docItem:[^\s]+)', line)
|
||||||
|
if ref_match:
|
||||||
|
valid_refs.append(ref_match.group(1))
|
||||||
|
|
||||||
|
# Check if all provided references are valid
|
||||||
|
for ref in document_refs:
|
||||||
|
if ref not in valid_refs:
|
||||||
|
logger.error(f"Invalid document reference: {ref}")
|
||||||
|
logger.error(f"Available references: {valid_refs}")
|
||||||
|
raise ValueError(f"Document reference '{ref}' not found in available documents. Use only exact references from AVAILABLE_DOCUMENTS_INDEX.")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error validating document references: {str(e)}")
|
||||||
|
raise ValueError(f"Failed to validate document references: {str(e)}")
|
||||||
|
|
||||||
async def _actExecute(self, context: TaskContext, selection: Dict[str, Any], taskStep: TaskStep,
|
async def _actExecute(self, context: TaskContext, selection: Dict[str, Any], taskStep: TaskStep,
|
||||||
workflow: ChatWorkflow, stepIndex: int) -> ActionResult:
|
workflow: ChatWorkflow, stepIndex: int) -> ActionResult:
|
||||||
"""Act: request minimal parameters then execute selected action"""
|
"""Act: request minimal parameters then execute selected action"""
|
||||||
|
|
|
||||||
|
|
@ -72,6 +72,9 @@ RULES:
|
||||||
3. parametersContext must be short and sufficient for Stage 2
|
3. parametersContext must be short and sufficient for Stage 2
|
||||||
4. Return ONLY JSON - no markdown, no explanations
|
4. Return ONLY JSON - no markdown, no explanations
|
||||||
5. For requiredInputDocuments, use ONLY exact references from AVAILABLE_DOCUMENTS_INDEX (docList:... or docItem:...)
|
5. For requiredInputDocuments, use ONLY exact references from AVAILABLE_DOCUMENTS_INDEX (docList:... or docItem:...)
|
||||||
|
- DO NOT invent or modify Message IDs
|
||||||
|
- DO NOT create new references
|
||||||
|
- Copy references EXACTLY as shown in AVAILABLE_DOCUMENTS_INDEX
|
||||||
6. For requiredConnection, use ONLY an exact label from AVAILABLE_CONNECTIONS_INDEX
|
6. For requiredConnection, use ONLY an exact label from AVAILABLE_CONNECTIONS_INDEX
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue