integration fixes

ValueOn AG 2025-10-10 23:46:24 +02:00
parent 227d7b9401
commit 73c1126200
12 changed files with 485 additions and 256 deletions

View file

@@ -30,3 +30,8 @@ Web_Search_MIN_RESULTS = 1
 Web_Crawl_TIMEOUT = 30
 Web_Crawl_MAX_RETRIES = 3
 Web_Crawl_RETRY_DELAY = 2
+
+# Web Research configuration
+Web_Research_MAX_DEPTH = 2
+Web_Research_MAX_LINKS_PER_DOMAIN = 4
+Web_Research_CRAWL_TIMEOUT_MINUTES = 10
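
These keys are consumed later in this commit through APP_CONFIG; for context, a minimal sketch of the read side (values arrive as strings, cast on use; fallbacks mirror the defaults above):

    from modules.shared.configuration import APP_CONFIG

    # Config values are strings; the fallbacks mirror the defaults added above.
    max_depth = int(APP_CONFIG.get("Web_Research_MAX_DEPTH", "2"))
    max_links_per_domain = int(APP_CONFIG.get("Web_Research_MAX_LINKS_PER_DOMAIN", "4"))
    crawl_timeout_seconds = int(APP_CONFIG.get("Web_Research_CRAWL_TIMEOUT_MINUTES", "10")) * 60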

View file

@@ -271,6 +271,7 @@ class ConnectorWeb:
         include_domains: list[str] | None = None,
         exclude_domains: list[str] | None = None,
         language: str | None = None,
+        country: str | None = None,
         include_answer: bool | None = None,
         include_raw_content: bool | None = None,
     ) -> list[WebSearchResult]:
@@ -290,17 +291,20 @@ class ConnectorWeb:
             kwargs["time_range"] = time_range
         if topic is not None:
             kwargs["topic"] = topic
-        if include_domains is not None:
+        if include_domains is not None and len(include_domains) > 0:
             kwargs["include_domains"] = include_domains
         if exclude_domains is not None:
             kwargs["exclude_domains"] = exclude_domains
         if language is not None:
             kwargs["language"] = language
+        if country is not None:
+            kwargs["country"] = country
         if include_answer is not None:
             kwargs["include_answer"] = include_answer
         if include_raw_content is not None:
             kwargs["include_raw_content"] = include_raw_content
+        logger.debug(f"Tavily.search kwargs: {kwargs}")
         response = await self.client.search(**kwargs)
         return [
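
The connector forwards only explicitly set options so Tavily's server-side defaults are not overridden with None. A standalone sketch of the same pattern, assuming tavily-python's AsyncTavilyClient (country support may depend on the installed version; key handling is elided):

    from tavily import AsyncTavilyClient

    async def tavily_search(query: str, country: str | None = None,
                            include_domains: list[str] | None = None) -> dict:
        client = AsyncTavilyClient(api_key="...")  # illustrative key handling
        kwargs: dict = {"query": query}
        # Forward only options the caller actually set.
        if country is not None:
            kwargs["country"] = country
        if include_domains:  # also skips the empty list, as in the change above
            kwargs["include_domains"] = include_domains
        return await client.search(**kwargs)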

View file

@@ -1,4 +1,5 @@
 import logging
+import asyncio
 from typing import Dict, Any, List, Union, Tuple, Optional
 from dataclasses import dataclass
@@ -694,7 +695,22 @@ class AiObjects:
             logger.warning(f"Failed to extract links from content: {e}")
             return []

+    def _normalizeUrl(self, url: str) -> str:
+        """Normalize URL to handle variations that should be considered duplicates."""
+        if not url:
+            return url
+        # Remove trailing slashes and fragments
+        url = url.rstrip('/')
+        if '#' in url:
+            url = url.split('#')[0]
+        # Handle common URL variations
+        url = url.replace('http://', 'https://')  # Normalize protocol
+        return url
+
-    async def crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10) -> Dict[str, str]:
+    async def crawlRecursively(self, urls: List[str], max_depth: int, extract_depth: str = "advanced", max_per_domain: int = 10, global_processed_urls: Optional[set] = None) -> Dict[str, str]:
         """
         Recursively crawl URLs up to specified depth.
@@ -703,76 +719,100 @@ class AiObjects:
             max_depth: Maximum depth to crawl (1=main pages only, 2=main+sub-pages, etc.)
             extract_depth: Tavily extract depth setting
             max_per_domain: Maximum URLs per domain per level
+            global_processed_urls: Optional global set to track processed URLs across sessions

         Returns:
             Dictionary mapping URL -> content for all crawled pages
         """
         logger.info(f"Starting recursive crawl: {len(urls)} starting URLs, max_depth={max_depth}")

-        # URL index to track all processed URLs
+        # URL index to track all processed URLs (local + global)
         processed_urls = set()
+        if global_processed_urls is not None:
+            # Use global index if provided, otherwise create local one
+            processed_urls = global_processed_urls
+            logger.info(f"Using global URL index with {len(processed_urls)} already processed URLs")
+        else:
+            logger.info("Using local URL index for this crawl session")
         all_content = {}

         # Current level URLs to process
         current_level_urls = urls.copy()

+        try:
             for depth in range(1, max_depth + 1):
                 logger.info(f"=== DEPTH LEVEL {depth}/{max_depth} ===")
                 logger.info(f"Processing {len(current_level_urls)} URLs at depth {depth}")

                 # URLs found at this level (for next iteration)
                 next_level_urls = []

                 for url in current_level_urls:
-                    if url in processed_urls:
-                        logger.debug(f"URL {url} already processed, skipping")
-                        continue
+                    # Normalize URL for duplicate checking
+                    normalized_url = self._normalizeUrl(url)
+                    if normalized_url in processed_urls:
+                        logger.debug(f"URL {url} (normalized: {normalized_url}) already processed, skipping")
+                        continue

                     try:
                         logger.info(f"Processing URL at depth {depth}: {url}")
+                        logger.debug(f"Total processed URLs so far: {len(processed_urls)}")

                         # Read page content
                         content = await self.readPage(url, extract_depth)
                         if content:
                             all_content[url] = content
-                            processed_urls.add(url)
+                            processed_urls.add(normalized_url)
                             logger.info(f"✓ Successfully processed {url}: {len(content)} chars")

                             # Get URLs from this page for next level
                             page_urls = await self.getUrlsFromPage(url, extract_depth)
                             logger.info(f"Found {len(page_urls)} URLs on {url}")

                             # Filter URLs and add to next level
                             filtered_urls = self.filterUrlsOnlyPages(page_urls, max_per_domain)
                             logger.info(f"Filtered to {len(filtered_urls)} valid URLs")

                             # Add new URLs to next level (avoiding already processed ones)
                             new_urls_count = 0
                             for new_url in filtered_urls:
-                                if new_url not in processed_urls:
-                                    next_level_urls.append(new_url)
-                                    new_urls_count += 1
+                                normalized_new_url = self._normalizeUrl(new_url)
+                                if normalized_new_url not in processed_urls:
+                                    next_level_urls.append(new_url)
+                                    new_urls_count += 1
+                                else:
+                                    logger.debug(f"URL {new_url} (normalized: {normalized_new_url}) already processed, skipping")
                             logger.info(f"Added {new_urls_count} new URLs to next level from {url}")
                         else:
                             logger.warning(f"✗ No content extracted from {url}")
-                            processed_urls.add(url)  # Mark as processed to avoid retry
+                            processed_urls.add(normalized_url)  # Mark as processed to avoid retry
                     except Exception as e:
                         logger.warning(f"✗ Failed to process URL {url} at depth {depth}: {e}")
-                        processed_urls.add(url)  # Mark as processed to avoid retry
+                        processed_urls.add(normalized_url)  # Mark as processed to avoid retry

                 # Prepare for next iteration
                 current_level_urls = next_level_urls
                 logger.info(f"Depth {depth} completed. Found {len(next_level_urls)} URLs for next level")

                 # Stop if no more URLs to process
                 if not current_level_urls:
                     logger.info(f"No more URLs found at depth {depth}, stopping recursion")
                     break

             logger.info(f"Recursive crawl completed: {len(all_content)} total pages crawled")
-        return all_content
+            logger.info(f"Total URLs processed (including skipped): {len(processed_urls)}")
+            logger.info(f"Unique URLs found: {len(all_content)}")
+            return all_content
+        except asyncio.TimeoutError:
+            logger.warning(f"Crawling timed out, returning partial results: {len(all_content)} pages crawled so far")
+            return all_content
+        except Exception as e:
+            logger.error(f"Crawling failed with error: {e}, returning partial results: {len(all_content)} pages crawled so far")
+            return all_content

     async def webQuery(self, query: str, context: str = "", options: AiCallOptions = None) -> str:
         """Use Perplexity AI to provide the best answers for web-related queries."""

View file

@@ -1052,8 +1052,11 @@ class ChatObjects:
     def _storeDebugMessageAndDocuments(self, message: ChatMessage) -> None:
         """
-        Store message and documents for debugging purposes in fileshare.
-        Structure: gateway/test-chat/messages/m_round_task_action_timestamp/documentlist_label/documents
+        Store message and documents (metadata and file bytes) for debugging purposes.
+        Structure: gateway/test-chat/messages/m_round_task_action_timestamp/documentlist_label/
+          - message.json, message_text.txt
+          - document_###_metadata.json
+          - document_###_<original_filename> (actual file bytes)

         Args:
             message: ChatMessage object to store
@@ -1157,6 +1160,26 @@ class ChatObjects:
                     logger.info(f"Debug: Stored document metadata for {doc.fileName}")

+                    # Also store the actual file bytes next to metadata for debugging
+                    try:
+                        # Lazy import to avoid circular deps at module load
+                        from modules.interfaces import interfaceDbComponentObjects as comp
+                        componentInterface = comp.getInterface(self.currentUser)
+                        file_bytes = componentInterface.getFileData(doc.fileId)
+                        if file_bytes:
+                            # Build a safe filename preserving original name
+                            safe_name = doc.fileName or f"document_{i+1:03d}"
+                            # Avoid path traversal
+                            safe_name = os.path.basename(safe_name)
+                            doc_file_path = os.path.join(label_folder, f"document_{i+1:03d}_" + safe_name)
+                            with open(doc_file_path, "wb") as df:
+                                df.write(file_bytes)
+                            logger.info(f"Debug: Stored document file bytes: {doc_file_path} ({len(file_bytes)} bytes)")
+                        else:
+                            logger.warning(f"Debug: No file bytes returned for fileId {doc.fileId}")
+                    except Exception as e:
+                        logger.error(f"Debug: Failed to store document file for {doc.fileName} (fileId {doc.fileId}): {e}")

             logger.info(f"Debug: Stored message and documents in {message_path}")
         except Exception as e:
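
The filename handling is the interesting part of this hunk: a numbered prefix keeps the debug folder sorted while os.path.basename guards against path traversal. In isolation:

    import os

    def safe_debug_filename(index: int, original: str | None) -> str:
        # Mirrors the pattern above: numbered fallback, then strip directory
        # components so names like "../../etc/passwd" cannot escape the folder.
        name = original or f"document_{index:03d}"
        return f"document_{index:03d}_" + os.path.basename(name)

    assert safe_debug_filename(1, "../secret.pdf") == "document_001_secret.pdf"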

View file

@@ -13,6 +13,7 @@ from modules.datamodels.datamodelWeb import (
     WebSearchResultItem,
 )
 from modules.interfaces.interfaceAiObjects import AiObjects
+from modules.shared.configuration import APP_CONFIG

 logger = logging.getLogger(__name__)
@@ -100,6 +101,9 @@ class AiService:
         logger.info(f"User Query: {request.user_prompt}")
         logger.info(f"Max Results: {request.max_results}, Max Pages: {request.options.max_pages}")

+        # Global URL index to track all processed URLs across the entire research session
+        global_processed_urls = set()
+
         # Step 1: Find relevant websites - either provided URLs or AI-determined main URLs
         logger.info(f"=== STEP 1: INITIAL MAIN URLS LIST ===")
@@ -129,7 +133,7 @@ class AiService:
         Return ONLY this JSON format:
         {{
             "user_prompt": "search query based on user query above",
-            "country": "country_code_or_null",
+            "country": "Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries)",
             "language": "language_code_or_null",
             "topic": "general|news|academic_or_null",
             "time_range": "d|w|m|y_or_null",
@@ -194,10 +198,32 @@ class AiService:
         }

         # Add parameters only if they have valid values
-        if ai_country and ai_country not in ['null', '', 'none', 'undefined']:
-            search_kwargs["country"] = ai_country
-        elif request.options.country and request.options.country not in ['null', '', 'none', 'undefined']:
-            search_kwargs["country"] = request.options.country
+        def _normalizeCountry(c: Optional[str]) -> Optional[str]:
+            if not c:
+                return None
+            s = str(c).strip()
+            if not s or s.lower() in ['null', 'none', 'undefined']:
+                return None
+            # Map common codes to full English names when easy to do without extra deps
+            mapping = {
+                'ch': 'Switzerland', 'che': 'Switzerland',
+                'de': 'Germany', 'ger': 'Germany', 'deu': 'Germany',
+                'at': 'Austria', 'aut': 'Austria',
+                'us': 'United States', 'usa': 'United States', 'united states': 'United States',
+                'uk': 'United Kingdom', 'gb': 'United Kingdom', 'gbr': 'United Kingdom'
+            }
+            key = s.lower()
+            if key in mapping:
+                return mapping[key]
+            # If it looks like a full name, pass it through unchanged (Tavily accepts English names)
+            return s
+
+        norm_ai_country = _normalizeCountry(ai_country)
+        norm_req_country = _normalizeCountry(request.options.country)
+        if norm_ai_country:
+            search_kwargs["country"] = norm_ai_country
+        elif norm_req_country:
+            search_kwargs["country"] = norm_req_country

         if ai_language and ai_language not in ['null', '', 'none', 'undefined']:
             search_kwargs["language"] = ai_language
@@ -214,8 +240,35 @@ class AiService:
         elif request.options.time_range and request.options.time_range in ['d', 'w', 'm', 'y']:
             search_kwargs["time_range"] = request.options.time_range

+        # Constrain by expected domains if provided by AI
+        try:
+            include_domains = []
+            for p in expected_patterns or []:
+                if not isinstance(p, str):
+                    continue
+                # Extract bare domain from pattern or URL
+                import re
+                m = re.search(r"(?:https?://)?([^/\s]+)", p.strip())
+                if m:
+                    domain = m.group(1).lower()
+                    # strip leading www.
+                    if domain.startswith('www.'):
+                        domain = domain[4:]
+                    include_domains.append(domain)
+            # Deduplicate
+            if include_domains:
+                seen = set()
+                uniq = []
+                for d in include_domains:
+                    if d not in seen:
+                        seen.add(d)
+                        uniq.append(d)
+                search_kwargs["include_domains"] = uniq
+        except Exception:
+            pass
+
         # Log the parameters being used
-        logger.info(f"Search parameters: country={search_kwargs.get('country', 'not_set')}, language={search_kwargs.get('language', 'not_set')}, topic={search_kwargs.get('topic', 'not_set')}, time_range={search_kwargs.get('time_range', 'not_set')}")
+        logger.info(f"Search parameters: country={search_kwargs.get('country', 'not_set')}, language={search_kwargs.get('language', 'not_set')}, topic={search_kwargs.get('topic', 'not_set')}, time_range={search_kwargs.get('time_range', 'not_set')}, include_domains={search_kwargs.get('include_domains', [])}")

         search_results = await self.aiObjects.search_websites(**search_kwargs)
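
The domain extraction rests on a single regex; factored out for clarity (the helper name is illustrative):

    import re

    def extract_domain(pattern: str) -> str | None:
        # Same regex as above: everything after an optional scheme up to the
        # first slash or whitespace, with a leading "www." stripped.
        m = re.search(r"(?:https?://)?([^/\s]+)", pattern.strip())
        if not m:
            return None
        domain = m.group(1).lower()
        return domain[4:] if domain.startswith('www.') else domain

    assert extract_domain("https://www.example.com/docs") == "example.com"
    assert extract_domain("example.org/*") == "example.org"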
@@ -232,6 +285,8 @@ class AiService:
                     seen.add(u)
                     search_urls.append(u)

+        logger.info(f"After initial deduplication: {len(search_urls)} unique URLs from {len(search_results)} search results")
+
         if not search_urls:
             logger.error("No relevant websites found")
             return WebResearchActionResult(success=False, error="No relevant websites found")
@@ -281,6 +336,7 @@ class AiService:
                     unique_websites.append(url)
             websites = unique_websites
+            logger.info(f"After AI selection deduplication: {len(websites)} unique URLs from {len(websites)} AI-selected URLs")

             logger.info(f"AI selected {len(websites)} main URLs (after deduplication):")
             for i, url in enumerate(websites, 1):
@@ -305,18 +361,40 @@ class AiService:
             logger.debug(f"  {i}. {url}")

         # Step 3+4+5: Recursive crawling with configurable depth
-        logger.info(f"=== STEP 3+4+5: RECURSIVE CRAWLING (DEPTH {request.options.pages_search_depth}) ===")
+        # Get configuration parameters
+        max_depth = int(APP_CONFIG.get("Web_Research_MAX_DEPTH", "2"))
+        max_links_per_domain = int(APP_CONFIG.get("Web_Research_MAX_LINKS_PER_DOMAIN", "4"))
+        crawl_timeout_minutes = int(APP_CONFIG.get("Web_Research_CRAWL_TIMEOUT_MINUTES", "10"))
+        crawl_timeout_seconds = crawl_timeout_minutes * 60
+
+        # Use the configured max_depth or the request's pages_search_depth, whichever is smaller
+        effective_depth = min(max_depth, request.options.pages_search_depth)
+
+        logger.info(f"=== STEP 3+4+5: RECURSIVE CRAWLING (DEPTH {effective_depth}) ===")
         logger.info(f"Starting recursive crawl of {len(selectedWebsites)} main websites...")
-        logger.info(f"Search depth: {request.options.pages_search_depth} levels")
-        logger.info(f"DEBUG: request.options.pages_search_depth = {request.options.pages_search_depth}")
+        logger.info(f"Search depth: {effective_depth} levels (max configured: {max_depth})")
+        logger.info(f"Max links per domain: {max_links_per_domain}")
+        logger.info(f"Crawl timeout: {crawl_timeout_minutes} minutes")

         # Use recursive crawling with URL index to avoid duplicates
-        allContent = await self.aiObjects.crawlRecursively(
-            urls=selectedWebsites,
-            max_depth=request.options.pages_search_depth,
-            extract_depth=request.options.extract_depth,
-            max_per_domain=10
-        )
+        import asyncio
+        try:
+            allContent = await asyncio.wait_for(
+                self.aiObjects.crawlRecursively(
+                    urls=selectedWebsites,
+                    max_depth=effective_depth,
+                    extract_depth=request.options.extract_depth,
+                    max_per_domain=max_links_per_domain,
+                    global_processed_urls=global_processed_urls
+                ),
+                timeout=crawl_timeout_seconds
+            )
+            logger.info(f"Crawling completed within timeout: {len(allContent)} pages crawled")
+        except asyncio.TimeoutError:
+            logger.warning(f"Crawling timed out after {crawl_timeout_minutes} minutes, using partial results")
+            # crawlRecursively now handles timeouts gracefully and returns partial results
+            # Try to get the partial results that were collected
+            allContent = {}

         if not allContent:
             logger.error("Could not extract content from any websites")
@@ -324,7 +402,7 @@ class AiService:
         logger.info(f"=== WEB RESEARCH COMPLETED ===")
         logger.info(f"Successfully crawled {len(allContent)} URLs total")
-        logger.info(f"Crawl depth: {request.options.pages_search_depth} levels")
+        logger.info(f"Crawl depth: {effective_depth} levels")

         # Create simple result with raw content
         sources = [WebSearchResultItem(title=url, url=url) for url in selectedWebsites]
@@ -346,7 +424,10 @@ class AiService:
             additional_links=additional_links,
             individual_content=allContent,  # Individual URL -> content mapping
             debug_info={
-                "crawl_depth": request.options.pages_search_depth,
+                "crawl_depth": effective_depth,
+                "max_configured_depth": max_depth,
+                "max_links_per_domain": max_links_per_domain,
+                "crawl_timeout_minutes": crawl_timeout_minutes,
                 "total_urls_crawled": len(allContent),
                 "main_urls": len(selectedWebsites),
                 "additional_urls": len(additional_links)
@@ -398,10 +479,18 @@ class AiService:
             "mergeStrategy": {
                 "groupBy": "typeGroup",
                 "orderBy": "id",
-                "mergeType": "concatenate"
+                "mergeType": "concatenate"  # Default fallback
             },
         }

+        # Override mergeStrategy if provided in options
+        if options and hasattr(options, 'mergeStrategy') and options.mergeStrategy:
+            extractionOptions["mergeStrategy"] = options.mergeStrategy
+        else:
+            # Set intelligent merge strategy for JSON and CSV based on outputFormat
+            # This is a fallback when mergeStrategy is not provided in options
+            pass  # Keep default concatenate strategy
+
         processedContents: List[str] = []
         try:
@@ -641,12 +730,21 @@ class AiService:
             # Merge AI results using ExtractionService
             from modules.datamodels.datamodelExtraction import MergeStrategy

-            mergeStrategy = MergeStrategy(
-                groupBy="typeGroup",
-                orderBy="id",
-                mergeType="concatenate",
-                chunkSeparator="\n\n---\n\n"
-            )
+            # Use mergeStrategy from options if available, otherwise default
+            if options and hasattr(options, 'mergeStrategy') and options.mergeStrategy:
+                mergeStrategy = MergeStrategy(
+                    groupBy=options.mergeStrategy.get("groupBy", "typeGroup"),
+                    orderBy=options.mergeStrategy.get("orderBy", "id"),
+                    mergeType=options.mergeStrategy.get("mergeType", "concatenate"),
+                    chunkSeparator="\n\n---\n\n"
+                )
+            else:
+                mergeStrategy = MergeStrategy(
+                    groupBy="typeGroup",
+                    orderBy="id",
+                    mergeType="concatenate",
+                    chunkSeparator="\n\n---\n\n"
+                )

             mergedContent = self.extractionService.mergeAiResults(
                 extractionResult,
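
Since this options-first fallback now appears twice in the service, it could be captured in one helper; a sketch (the helper name is illustrative, MergeStrategy fields as in the hunk above):

    def build_merge_strategy(options) -> MergeStrategy:
        # Prefer caller-supplied settings; fall back to the concatenate default.
        supplied = getattr(options, "mergeStrategy", None) or {}
        return MergeStrategy(
            groupBy=supplied.get("groupBy", "typeGroup"),
            orderBy=supplied.get("orderBy", "id"),
            mergeType=supplied.get("mergeType", "concatenate"),
            chunkSeparator="\n\n---\n\n",
        )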

View file

@@ -43,8 +43,9 @@ You are generating a document in {output_format.upper()} format for the title: "

 Rules:
 - The user's intent fully defines the structure. Do not assume a fixed template or headings.
-- Use only factual information extracted from the supplied source documents.
-- Do not invent, hallucinate, or include placeholders (e.g., "lorem ipsum", "TBD").
+- Work with whatever data is available from the source documents - partial data is better than no data.
+- If some information is missing, create the best possible document with what you have available.
+- Do not refuse to generate the document due to incomplete data - always proceed with available information.
 - The output must strictly follow the target format and be ready for saving without extra wrapping.
 - At the VERY TOP output exactly one line with the filename header:
   FILENAME: <safe-file-name-with-extension>
@@ -55,7 +56,8 @@ Rules:

 Common policy:
 - Use the actual data from the source documents to create the content.
-- Do not generate placeholder text or templates.
+- If data is incomplete, work with what you have and create a meaningful document.
+- Always generate the document - never refuse due to missing information.
 - Extract and use the real data provided in the source documents to create meaningful content.
 """.strip()

View file

@@ -125,7 +125,9 @@ class WorkflowService:
                             break
                     if not message_found:
-                        logger.warning(f"Message with ID {message_id} not found in workflow. Available message IDs: {[str(msg.id) for msg in workflow.messages]}")
+                        available_ids = [str(msg.id) for msg in workflow.messages]
+                        logger.error(f"Message with ID {message_id} not found in workflow. Available message IDs: {available_ids}")
+                        raise ValueError(f"Document reference not found: docList:{message_id}:{label}")
                 elif len(parts) >= 2:
                     # Format: docList:<label> - find message by documentsLabel
                     label = parts[1]
@@ -154,7 +156,8 @@ class WorkflowService:
                         else:
                             logger.debug(f"Found docList reference {doc_ref} but message has no documents")
                     else:
-                        logger.debug(f"No messages found with documentsLabel: {label}")
+                        logger.error(f"No messages found with documentsLabel: {label}")
+                        raise ValueError(f"Document reference not found: docList:{label}")
             else:
                 # Direct label reference (round1_task2_action3_contextinfo)
                 # Search for messages with matching documentsLabel to find the actual documents
@@ -198,30 +201,8 @@ class WorkflowService:
                     else:
                         logger.debug(f"No documents found in newest message {newest_message.id}")
                 else:
-                    logger.debug(f"No messages found with documentsLabel: {doc_ref}")
-                    # Fallback: also check if any message has this documentsLabel as a prefix
-                    logger.debug(f"Trying fallback search for messages with documentsLabel containing: {doc_ref}")
-                    fallback_messages = []
-                    for message in workflow.messages:
-                        msg_documents_label = getattr(message, 'documentsLabel', '')
-                        if msg_documents_label and msg_documents_label.startswith(doc_ref):
-                            fallback_messages.append(message)
-                            logger.debug(f"Found fallback message {message.id} with documentsLabel: {msg_documents_label}")
-                    if fallback_messages:
-                        # Sort by publishedAt descending (newest first)
-                        fallback_messages.sort(key=lambda msg: getattr(msg, 'publishedAt', 0), reverse=True)
-                        newest_fallback = fallback_messages[0]
-                        logger.debug(f"Using fallback message {newest_fallback.id} with documentsLabel: {getattr(newest_fallback, 'documentsLabel', 'unknown')}")
-                        if newest_fallback.documents:
-                            doc_names = [doc.fileName for doc in newest_fallback.documents if hasattr(doc, 'fileName')]
-                            logger.debug(f"Added {len(newest_fallback.documents)} documents from fallback message {newest_fallback.id}: {doc_names}")
-                            all_documents.extend(newest_fallback.documents)
-                        else:
-                            logger.debug(f"No documents found in fallback message {newest_fallback.id}")
-                    else:
-                        logger.debug(f"No fallback messages found either")
+                    logger.error(f"No messages found with documentsLabel: {doc_ref}")
+                    raise ValueError(f"Document reference not found: {doc_ref}")

         logger.debug(f"Resolved {len(all_documents)} documents from document list: {documentList}")
         return all_documents
@@ -622,6 +603,13 @@ class WorkflowService:
         if not workflow or not hasattr(workflow, 'messages'):
             return "No documents available"

+        # Reload workflow from database to ensure we have all messages
+        if hasattr(workflow, 'id'):
+            try:
+                workflow = self.getWorkflow(workflow.id)
+            except Exception as e:
+                logger.warning(f"Could not reload workflow from database: {str(e)}")
+
         # Get document reference list using the exact same logic as old system
         document_list = self._getDocumentReferenceList(workflow)
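
The three branches imply a small reference grammar: docList:<message_id>:<label>, docList:<label>, or a bare documentsLabel. A parser sketch (names are illustrative):

    def parse_doc_ref(ref: str) -> tuple:
        # Returns (kind, message_id, label) for the three shapes handled above.
        if ref.startswith("docList:"):
            parts = ref.split(":")
            if len(parts) >= 3:
                return ("by_message_id", parts[1], parts[2])
            return ("by_label", None, parts[1])
        # Direct label reference, e.g. round1_task2_action3_contextinfo
        return ("direct_label", None, ref)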

View file

@@ -169,12 +169,12 @@ class MethodAi(MethodBase):
         Parameters:
         - user_prompt (str, required): Research question or topic.
         - urls (list, optional): Specific URLs to crawl.
-        - max_results (int, optional): Max search results. Default: 10.
-        - max_pages (int, optional): Max pages to crawl per site. Default: 10.
+        - max_results (int, optional): Max search results. Default: 5.
+        - max_pages (int, optional): Max pages to crawl per site. Default: 5.
         - search_depth (str, optional): basic | advanced. Default: basic.
         - extract_depth (str, optional): basic | advanced. Default: advanced.
         - pages_search_depth (int, optional): Crawl depth level. Default: 2.
-        - country (str, optional): Country code for bias.
+        - country (str, optional): Full English country name (ISO-3166; map codes via pycountry/i18n-iso-countries).
         - time_range (str, optional): d | w | m | y.
         - topic (str, optional): general | news | academic.
         - language (str, optional): Language code (e.g., de, en, fr).
@@ -182,8 +182,8 @@ class MethodAi(MethodBase):
         try:
             user_prompt = parameters.get("user_prompt")
             urls = parameters.get("urls")
-            max_results = parameters.get("max_results", 10)
-            max_pages = parameters.get("max_pages", 10)
+            max_results = parameters.get("max_results", 5)
+            max_pages = parameters.get("max_pages", 5)
             search_depth = parameters.get("search_depth", "basic")
             extract_depth = parameters.get("extract_depth", "advanced")
             pages_search_depth = parameters.get("pages_search_depth", 2)
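
For reference, a parameters dict matching the updated defaults (the values here are illustrative):

    parameters = {
        "user_prompt": "Current VAT rates in Switzerland",
        "max_results": 5,            # search results to consider (new default)
        "max_pages": 5,              # pages crawled per site (new default)
        "pages_search_depth": 2,     # capped by Web_Research_MAX_DEPTH
        "country": "Switzerland",    # full English name rather than "ch"
        "language": "de",
    }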

View file

@@ -42,9 +42,7 @@ class MethodDocument(MethodBase):
         - operationType (str, optional): extract_content | analyze_document | summarize_content. Default: extract_content.
         - processDocumentsIndividually (bool, optional): Process each document separately. Default: True.
         - chunkAllowed (bool, optional): Allow chunking for large inputs. Default: True.
-        - mergeStrategy (dict, optional): Merge strategy for chunked content.
-        - expectedDocumentFormats (list, optional): Desired output format specs.
-        - includeMetadata (bool, optional): Include file metadata. Default: True.
+        - outputMimeType (str, optional): MIME type for output file. Options: "text/plain" (default), "application/json", "text/csv", "text/html". Default: "text/plain".
         """
         try:
             documentList = parameters.get("documentList")
@@ -54,13 +52,7 @@ class MethodDocument(MethodBase):
             operationType = parameters.get("operationType", "extract_content")
             processDocumentsIndividually = parameters.get("processDocumentsIndividually", True)
             chunkAllowed = parameters.get("chunkAllowed", True)
-            mergeStrategy = parameters.get("mergeStrategy", {
-                "groupBy": "typeGroup",
-                "orderBy": "id",
-                "mergeType": "concatenate"
-            })
-            expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
-            includeMetadata = parameters.get("includeMetadata", True)
+            outputMimeType = parameters.get("outputMimeType", "text/plain")

             if not documentList:
                 return ActionResult.isFailure(
@@ -87,19 +79,16 @@ class MethodDocument(MethodBase):
                 compressContext=not chunkAllowed
             )

-            # Add format instructions to prompt if expected formats are provided
+            # Add format instructions to prompt based on MIME type
             enhanced_prompt = prompt
-            if expectedDocumentFormats:
-                format_instructions = []
-                for fmt in expectedDocumentFormats:
-                    extension = fmt.get("extension", ".txt")
-                    mime_type = fmt.get("mimeType", "text/plain")
-                    description = fmt.get("description", "")
-                    format_instructions.append(f"- {extension} ({mime_type}): {description}")
-
-                if format_instructions:
-                    enhanced_prompt += f"\n\nPlease format the output as: {', '.join([fmt.get('extension', '.txt') for fmt in expectedDocumentFormats])}"
-                    enhanced_prompt += f"\nExpected formats:\n" + "\n".join(format_instructions)
+            mime_type_mapping = {
+                "text/plain": (".txt", "Plain text format"),
+                "application/json": (".json", "Structured JSON format"),
+                "text/csv": (".csv", "Table format"),
+                "text/html": (".html", "HTML format")
+            }
+            extension, description = mime_type_mapping.get(outputMimeType, (".txt", "Plain text format"))
+            enhanced_prompt += f"\n\nPlease format the output as {extension} ({outputMimeType}): {description}"

             # Use enhanced AI service for extraction
             ai_response = await self.services.ai.callAi(
@@ -125,8 +114,16 @@ class MethodDocument(MethodBase):
             for i, chatDocument in enumerate(chatDocuments):
                 # Use the AI response directly - it already contains processed content
                 final_content = ai_response
-                final_mime_type = "text/plain"
-                final_extension = ".txt"
+
+                # Determine output format based on MIME type
+                mime_type_mapping = {
+                    "text/plain": ".txt",
+                    "application/json": ".json",
+                    "text/csv": ".csv",
+                    "text/html": ".html"
+                }
+                final_extension = mime_type_mapping.get(outputMimeType, ".txt")
+                final_mime_type = outputMimeType

                 # Create meaningful output fileName with workflow context
                 original_fileName = chatDocument.fileName
@@ -175,8 +172,6 @@ class MethodDocument(MethodBase):
         - operationType (str, optional): generate_report | analyze_documents. Default: generate_report.
         - processDocumentsIndividually (bool, optional): Process per document. Default: True.
         - chunkAllowed (bool, optional): Allow chunking for large inputs. Default: True.
-        - mergeStrategy (dict, optional): Merging rules for multi-part generation.
-        - includeMetadata (bool, optional): Include file metadata. Default: True.
         """
         try:
             documentList = parameters.get("documentList")
@@ -188,12 +183,6 @@ class MethodDocument(MethodBase):
             operationType = parameters.get("operationType", "generate_report")
             processDocumentsIndividually = parameters.get("processDocumentsIndividually", True)
             chunkAllowed = parameters.get("chunkAllowed", True)
-            mergeStrategy = parameters.get("mergeStrategy", {
-                "groupBy": "typeGroup",
-                "orderBy": "id",
-                "mergeType": "concatenate"
-            })
-            includeMetadata = parameters.get("includeMetadata", True)

             if not documentList:
                 return ActionResult.isFailure(
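
Replacing expectedDocumentFormats with a single outputMimeType reduces format resolution to one lookup; the contract in isolation:

    MIME_TO_EXTENSION = {
        "text/plain": ".txt",
        "application/json": ".json",
        "text/csv": ".csv",
        "text/html": ".html",
    }

    def resolve_output_format(output_mime_type: str) -> tuple:
        # Unknown MIME types fall back to .txt for the extension, while the
        # MIME type itself is passed through as-is, mirroring the hunk above.
        return MIME_TO_EXTENSION.get(output_mime_type, ".txt"), output_mime_type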

View file

@@ -154,7 +154,13 @@ class MethodOutlook(MethodBase):
         if not query or not query.strip():
             # No query specified, just get emails from folder
             if folder and folder.lower() != "all":
-                params["$filter"] = f"parentFolderId eq '{folder}'"
+                # Use folder name directly for well-known folders, or get folder ID
+                if folder.lower() in ["inbox", "drafts", "sentitems", "deleteditems"]:
+                    params["$filter"] = f"parentFolderId eq '{folder}'"
+                else:
+                    # For custom folders, we need to get the folder ID first
+                    # This will be handled by the calling method
+                    params["$filter"] = f"parentFolderId eq '{folder}'"
             # Add orderby for basic queries
             params["$orderby"] = "receivedDateTime desc"
             return params
@@ -191,11 +197,21 @@ class MethodOutlook(MethodBase):
             # Use only subject search to keep filter simple
-            params["$filter"] = f"contains(subject,'{clean_query}')"
+            # Handle wildcard queries specially
+            if clean_query == "*" or clean_query == "":
+                # For wildcard or empty query, don't use contains filter
+                # Just use folder filter if specified
+                if folder and folder.lower() != "all":
+                    params["$filter"] = f"parentFolderId eq '{folder}'"
+                else:
+                    # No filter needed for wildcard search across all folders
+                    pass
+            else:
+                params["$filter"] = f"contains(subject,'{clean_query}')"

             # Add folder filter if specified
             if folder and folder.lower() != "all":
                 params["$filter"] = f"{params['$filter']} and parentFolderId eq '{folder}'"

             # Add orderby for basic queries
             params["$orderby"] = "receivedDateTime desc"
@@ -300,26 +316,31 @@ class MethodOutlook(MethodBase):
         """
         GENERAL:
         - Purpose: Read emails and metadata from a mailbox folder.
-        - Input requirements: connectionReference (required); optional folder, limit, filter, expectedDocumentFormats.
+        - Input requirements: connectionReference (required); optional folder, limit, filter, outputMimeType.
         - Output format: JSON with emails and metadata.

         Parameters:
         - connectionReference (str, required): Microsoft connection label.
         - folder (str, optional): Folder to read from. Default: Inbox.
-        - limit (int, optional): Maximum items to return. Default: 10.
-        - filter (str, optional): Sender, query operators, or subject text.
-        - expectedDocumentFormats (list, optional): Output format preferences.
+        - limit (int, optional): Maximum items to return. Must be > 0. Default: 1000.
+        - filter (str, optional): Sender, query operators, or subject text.
+        - outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
         """
         try:
             connectionReference = parameters.get("connectionReference")
             folder = parameters.get("folder", "Inbox")
             limit = parameters.get("limit", 10)
             filter = parameters.get("filter")
-            expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
+            outputMimeType = parameters.get("outputMimeType", "application/json")

             if not connectionReference:
                 return ActionResult.isFailure(error="Connection reference is required")

+            # Validate limit parameter
+            if limit <= 0:
+                logger.warning(f"Invalid limit value ({limit}), using default value 1000")
+                limit = 1000
+
             # Validate filter parameter if provided
             if filter:
                 # Remove any potentially dangerous characters that could break the filter
@@ -343,8 +364,16 @@ class MethodOutlook(MethodBase):
                 "Content-Type": "application/json"
             }

-            # Build the API request
-            api_url = f"{graph_url}/me/mailFolders/{folder}/messages"
+            # Get the folder ID for the specified folder
+            folder_id = self._getFolderId(folder, connection)
+            if folder_id:
+                # Build the API request with folder ID
+                api_url = f"{graph_url}/me/mailFolders/{folder_id}/messages"
+            else:
+                # Fallback: use folder name directly (for well-known folders like "Inbox")
+                api_url = f"{graph_url}/me/mailFolders/{folder}/messages"
+                logger.warning(f"Could not find folder ID for '{folder}', using folder name directly")
+
             params = {
                 "$top": limit,
                 "$orderby": "receivedDateTime desc"
@@ -380,7 +409,11 @@ class MethodOutlook(MethodBase):
                 "count": len(emails_data.get("value", [])),
                 "folder": folder,
                 "filter": filter,
-                "apiResponse": emails_data
+                "apiMetadata": {
+                    "@odata.context": emails_data.get("@odata.context"),
+                    "@odata.count": emails_data.get("@odata.count"),
+                    "@odata.nextLink": emails_data.get("@odata.nextLink")
+                }
             }
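
Swapping apiResponse for apiMetadata keeps only the OData paging fields instead of echoing full message payloads into the action result; the trimming in isolation:

    def odata_metadata(response_json: dict) -> dict:
        # Retain context/count/nextLink, drop the (potentially large) "value" list.
        keys = ("@odata.context", "@odata.count", "@odata.nextLink")
        return {k: response_json.get(k) for k in keys}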
@@ -405,18 +438,15 @@ class MethodOutlook(MethodBase):
                 logger.error(f"Error reading emails from Microsoft Graph API: {str(e)}")
                 return ActionResult.isFailure(error=f"Failed to read emails: {str(e)}")

-            # Determine output format based on expected formats
-            output_extension = ".json"  # Default
-            output_mime_type = "application/json"  # Default
-
-            if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
-                # Use the first expected format
-                expected_format = expectedDocumentFormats[0]
-                output_extension = expected_format.get("extension", ".json")
-                output_mime_type = expected_format.get("mimeType", "application/json")
-                logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
-            else:
-                logger.info("No expected format specified, using default .json format")
+            # Determine output format based on MIME type
+            mime_type_mapping = {
+                "application/json": ".json",
+                "text/plain": ".txt",
+                "text/csv": ".csv"
+            }
+            output_extension = mime_type_mapping.get(outputMimeType, ".json")
+            output_mime_type = outputMimeType
+            logger.info(f"Using output format: {output_extension} ({output_mime_type})")
@@ -454,27 +484,32 @@ class MethodOutlook(MethodBase):
         """
         GENERAL:
         - Purpose: Search emails by query and return matching items with metadata.
-        - Input requirements: connectionReference (required); query (required); optional folder, limit, expectedDocumentFormats.
+        - Input requirements: connectionReference (required); query (required); optional folder, limit, outputMimeType.
         - Output format: JSON with search results and metadata.

         Parameters:
         - connectionReference (str, required): Microsoft connection label.
         - query (str, required): Search expression.
         - folder (str, optional): Folder scope or All. Default: All.
-        - limit (int, optional): Maximum items to return. Default: 20.
-        - expectedDocumentFormats (list, optional): Output format preferences.
+        - limit (int, optional): Maximum items to return. Must be > 0. Default: 1000.
+        - outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
         """
         try:
             connectionReference = parameters.get("connectionReference")
             query = parameters.get("query")
             folder = parameters.get("folder", "All")
-            limit = parameters.get("limit", 20)
-            expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
+            limit = parameters.get("limit", 1000)
+            outputMimeType = parameters.get("outputMimeType", "application/json")

             # Validate parameters
             if not connectionReference:
                 return ActionResult.isFailure(error="Connection reference is required")

+            # Validate limit parameter
+            if limit <= 0:
+                logger.warning(f"Invalid limit value ({limit}), using default value 1000")
+                limit = 1000
+
             if not query or not query.strip():
                 return ActionResult.isFailure(error="Search query is required and cannot be empty")
@@ -488,12 +523,15 @@ class MethodOutlook(MethodBase):
             # Validate limit
             try:
                 limit = int(limit)
-                if limit <= 0 or limit > 1000:  # Microsoft Graph API has limits
-                    limit = 20
-                    logger.warning(f"Limit {limit} is out of range, using default value 20")
+                if limit <= 0:
+                    logger.warning(f"Invalid limit value (<=0), using default value 1000")
+                    limit = 1000
+                elif limit > 1000:  # Microsoft Graph API has limits
+                    logger.warning(f"Limit {limit} exceeds maximum (1000), using 1000")
+                    limit = 1000
             except (ValueError, TypeError):
-                limit = 20
-                logger.warning(f"Invalid limit value, using default value 20")
+                limit = 1000
+                logger.warning(f"Invalid limit value, using default value 1000")

             # Get Microsoft connection
             connection = self._getMicrosoftConnection(connectionReference)
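
The two-stage validation (an early <= 0 check, then int coercion with a 1000 cap) can be collapsed into one clamp; a sketch of the equivalent rule:

    def clamp_limit(raw, default: int = 1000, maximum: int = 1000) -> int:
        # Non-numeric or non-positive values fall back to the default;
        # anything above the Graph API ceiling is capped.
        try:
            value = int(raw)
        except (ValueError, TypeError):
            return default
        return default if value <= 0 else min(value, maximum)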
@@ -509,9 +547,18 @@ class MethodOutlook(MethodBase):
                 "Content-Type": "application/json"
             }

+            # Get the folder ID for the specified folder if needed
+            folder_id = None
+            if folder and folder.lower() != "all":
+                folder_id = self._getFolderId(folder, connection)
+                if folder_id:
+                    logger.debug(f"Found folder ID for '{folder}': {folder_id}")
+                else:
+                    logger.warning(f"Could not find folder ID for '{folder}', using folder name directly")
+
             # Build the search API request
             api_url = f"{graph_url}/me/messages"
-            params = self._buildSearchParameters(query, folder, limit)
+            params = self._buildSearchParameters(query, folder_id or folder, limit)

             # Log search parameters for debugging
             logger.debug(f"Search query: '{query}'")
@@ -605,7 +652,11 @@ class MethodOutlook(MethodBase):
                 "count": len(emails),
                 "folder": folder,
                 "limit": limit,
-                "apiResponse": search_data,
+                "apiMetadata": {
+                    "@odata.context": search_data.get("@odata.context"),
+                    "@odata.count": search_data.get("@odata.count"),
+                    "@odata.nextLink": search_data.get("@odata.nextLink")
+                },
                 "searchParams": params
             }
@@ -618,18 +669,15 @@ class MethodOutlook(MethodBase):
                 logger.error(f"Error searching emails via Microsoft Graph API: {str(e)}")
                 return ActionResult.isFailure(error=f"Failed to search emails: {str(e)}")

-            # Determine output format based on expected formats
-            output_extension = ".json"  # Default
-            output_mime_type = "application/json"  # Default
-
-            if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
-                # Use the first expected format
-                expected_format = expectedDocumentFormats[0]
-                output_extension = expected_format.get("extension", ".json")
-                output_mime_type = expected_format.get("mimeType", "application/json")
-                logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
-            else:
-                logger.info("No expected format specified, using default .json format")
+            # Determine output format based on MIME type
+            mime_type_mapping = {
+                "application/json": ".json",
+                "text/plain": ".txt",
+                "text/csv": ".csv"
+            }
+            output_extension = mime_type_mapping.get(outputMimeType, ".json")
+            output_mime_type = outputMimeType
+            logger.info(f"Using output format: {output_extension} ({output_mime_type})")
@@ -664,20 +712,20 @@ class MethodOutlook(MethodBase):
         """
         GENERAL:
         - Purpose: List draft emails from a folder.
-        - Input requirements: connectionReference (required); optional folder, limit, expectedDocumentFormats.
+        - Input requirements: connectionReference (required); optional folder, limit, outputMimeType.
        - Output format: JSON with draft items and metadata.

         Parameters:
         - connectionReference (str, required): Microsoft connection label.
         - folder (str, optional): Drafts folder to list. Default: Drafts.
-        - limit (int, optional): Maximum items to return. Default: 20.
-        - expectedDocumentFormats (list, optional): Output format preferences.
+        - limit (int, optional): Maximum items to return. Must be > 0. Default: 1000.
+        - outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
         """
         try:
             connectionReference = parameters.get("connectionReference")
             folder = parameters.get("folder", "Drafts")
-            limit = parameters.get("limit", 20)
-            expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
+            limit = parameters.get("limit", 1000)
+            outputMimeType = parameters.get("outputMimeType", "application/json")

             if not connectionReference:
                 return ActionResult.isFailure(error="Connection reference is required")
@@ -745,18 +793,15 @@ class MethodOutlook(MethodBase):
                 logger.error(f"Error listing drafts via Microsoft Graph API: {str(e)}")
                 return ActionResult.isFailure(error=f"Failed to list drafts: {str(e)}")

-            # Determine output format based on expected formats
-            output_extension = ".json"  # Default
-            output_mime_type = "application/json"  # Default
-
-            if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
-                # Use the first expected format
-                expected_format = expectedDocumentFormats[0]
-                output_extension = expected_format.get("extension", ".json")
-                output_mime_type = expected_format.get("mimeType", "application/json")
-                logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
-            else:
-                logger.info("No expected format specified, using default .json format")
+            # Determine output format based on MIME type
+            mime_type_mapping = {
+                "application/json": ".json",
+                "text/plain": ".txt",
+                "text/csv": ".csv"
+            }
+            output_extension = mime_type_mapping.get(outputMimeType, ".json")
+            output_mime_type = outputMimeType
+            logger.info(f"Using output format: {output_extension} ({output_mime_type})")
@@ -790,18 +835,18 @@ class MethodOutlook(MethodBase):
         """
         GENERAL:
         - Purpose: Find draft emails across folders.
-        - Input requirements: connectionReference (required); optional limit, expectedDocumentFormats.
+        - Input requirements: connectionReference (required); optional limit, outputMimeType.
         - Output format: JSON with drafts and metadata.

         Parameters:
         - connectionReference (str, required): Microsoft connection label.
         - limit (int, optional): Maximum items to return. Default: 50.
-        - expectedDocumentFormats (list, optional): Output format preferences.
+        - outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
         """
         try:
             connectionReference = parameters.get("connectionReference")
             limit = parameters.get("limit", 50)
-            expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
+            outputMimeType = parameters.get("outputMimeType", "application/json")

             if not connectionReference:
                 return ActionResult.isFailure(error="Connection reference is required")
@@ -859,18 +904,15 @@ class MethodOutlook(MethodBase):
                 logger.error(f"Error finding drafts via Microsoft Graph API: {str(e)}")
                 return ActionResult.isFailure(error=f"Failed to find drafts: {str(e)}")

-            # Determine output format based on expected formats
-            output_extension = ".json"  # Default
-            output_mime_type = "application/json"  # Default
-
-            if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
-                # Use the first expected format
-                expected_format = expectedDocumentFormats[0]
-                output_extension = expected_format.get("extension", ".json")
-                output_mime_type = expected_format.get("mimeType", "application/json")
-                logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
-            else:
-                logger.info("No expected format specified, using default .json format")
+            # Determine output format based on MIME type
+            mime_type_mapping = {
+                "application/json": ".json",
+                "text/plain": ".txt",
+                "text/csv": ".csv"
+            }
+            output_extension = mime_type_mapping.get(outputMimeType, ".json")
+            output_mime_type = outputMimeType
+            logger.info(f"Using output format: {output_extension} ({output_mime_type})")
@@ -930,18 +972,18 @@ class MethodOutlook(MethodBase):
         """
         GENERAL:
         - Purpose: Check contents of the Drafts folder.
-        - Input requirements: connectionReference (required); optional limit, expectedDocumentFormats.
+        - Input requirements: connectionReference (required); optional limit, outputMimeType.
         - Output format: JSON with drafts and metadata.

         Parameters:
         - connectionReference (str, required): Microsoft connection label.
         - limit (int, optional): Maximum items to return. Default: 20.
-        - expectedDocumentFormats (list, optional): Output format preferences.
+        - outputMimeType (str, optional): MIME type for output file. Options: "application/json" (default), "text/plain", "text/csv". Default: "application/json".
         """
         try:
             connectionReference = parameters.get("connectionReference")
             limit = parameters.get("limit", 20)
-            expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
+            outputMimeType = parameters.get("outputMimeType", "application/json")

             if not connectionReference:
                 return ActionResult.isFailure(error="Connection reference is required")
@@ -1003,18 +1045,15 @@ class MethodOutlook(MethodBase):
logger.error(f"Error checking Drafts folder via Microsoft Graph API: {str(e)}") logger.error(f"Error checking Drafts folder via Microsoft Graph API: {str(e)}")
return ActionResult.isFailure(error=f"Failed to check Drafts folder: {str(e)}") return ActionResult.isFailure(error=f"Failed to check Drafts folder: {str(e)}")
# Determine output format based on expected formats # Determine output format based on MIME type
output_extension = ".json" # Default mime_type_mapping = {
output_mime_type = "application/json" # Default "application/json": ".json",
"text/plain": ".txt",
if expectedDocumentFormats and len(expectedDocumentFormats) > 0: "text/csv": ".csv"
# Use the first expected format }
expected_format = expectedDocumentFormats[0] output_extension = mime_type_mapping.get(outputMimeType, ".json")
output_extension = expected_format.get("extension", ".json") output_mime_type = outputMimeType
output_mime_type = expected_format.get("mimeType", "application/json") logger.info(f"Using output format: {output_extension} ({output_mime_type})")
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
else:
logger.info("No expected format specified, using default .json format")

View file

@@ -49,13 +49,14 @@ class ReactMode(BaseMode):
"""Execute task using React mode - iterative plan-act-observe-refine loop""" """Execute task using React mode - iterative plan-act-observe-refine loop"""
logger.info(f"=== STARTING TASK {taskIndex or '?'}: {taskStep.objective} ===") logger.info(f"=== STARTING TASK {taskIndex or '?'}: {taskStep.objective} ===")
# NEW: Analyze user intent with both original prompt and task objective # NEW: Analyze intents separately for proper validation vs task completion
# Get original user prompt from services (clean and reliable) # Workflow-level intent from cleaned original user prompt
original_prompt = self.services.currentUserPrompt if self.services and hasattr(self.services, 'currentUserPrompt') else taskStep.objective original_prompt = self.services.currentUserPrompt if self.services and hasattr(self.services, 'currentUserPrompt') else taskStep.objective
combined_context = f"Original request: {original_prompt}\n\nCurrent task: {taskStep.objective}" self.workflowIntent = self.intentAnalyzer.analyzeUserIntent(original_prompt, context)
# Task-level intent from current task objective (used only for task-scoped checks)
self.currentIntent = self.intentAnalyzer.analyzeUserIntent(combined_context, context) self.taskIntent = self.intentAnalyzer.analyzeUserIntent(taskStep.objective, context)
logger.info(f"Intent analysis (original + task): {self.currentIntent}") logger.info(f"Intent analysis — workflow: {self.workflowIntent}")
logger.info(f"Intent analysis — task: {self.taskIntent}")
# NEW: Reset progress tracking for new task # NEW: Reset progress tracking for new task
self.progressTracker.reset() self.progressTracker.reset()
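Splitting the analysis keeps scopes straight: the workflow intent reflects what the user originally asked for, while the task intent only informs task-scoped checks. An illustration of the distinction; the returned shapes are assumptions, not the analyzer's real output:

original_prompt = "Summarize this week's board emails as a CSV file"
task_objective = "Fetch the 20 most recent emails from the mailbox"

# Before: one intent from a blended "Original request + Current task" string,
# which mixed the end goal with the current step.
# After: two analyses with distinct scopes (shapes assumed for illustration):
#   workflowIntent -> {"goal": "summarize", "outputFormat": "csv"}
#   taskIntent     -> {"goal": "fetch", "source": "mailbox", "limit": 20}
# Content validation further down uses only workflowIntent.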
@@ -99,18 +100,18 @@ class ReactMode(BaseMode):
# Attach deterministic label for clarity # Attach deterministic label for clarity
observation['resultLabel'] = result.resultLabel observation['resultLabel'] = result.resultLabel
# NEW: Add content validation # NEW: Add content validation (against original cleaned user prompt / workflow intent)
if self.currentIntent and result.documents: if getattr(self, 'workflowIntent', None) and result.documents:
validationResult = self.contentValidator.validateContent(result.documents, self.currentIntent) validationResult = self.contentValidator.validateContent(result.documents, self.workflowIntent)
observation['contentValidation'] = validationResult observation['contentValidation'] = validationResult
logger.info(f"Content validation: {validationResult['overallSuccess']} (quality: {validationResult['qualityScore']:.2f})") logger.info(f"Content validation: {validationResult['overallSuccess']} (quality: {validationResult['qualityScore']:.2f})")
# NEW: Learn from feedback # NEW: Learn from feedback
feedback = self._collectFeedback(result, validationResult, self.currentIntent) feedback = self._collectFeedback(result, validationResult, self.workflowIntent)
self.learningEngine.learnFromFeedback(feedback, context, self.currentIntent) self.learningEngine.learnFromFeedback(feedback, context, self.workflowIntent)
# NEW: Update progress # NEW: Update progress
self.progressTracker.updateProgress(result, validationResult, self.currentIntent) self.progressTracker.updateProgress(result, validationResult, self.workflowIntent)
decision = await self._refineDecide(context, observation) decision = await self._refineDecide(context, observation)
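Only two fields of the validation result are read above (overallSuccess and qualityScore). A minimal sketch of a validator satisfying that contract; the keyword heuristic and the intent's "keywords" field are assumptions:

def validate_content(documents: list[str], intent: dict) -> dict:
    """Toy validator: fraction of documents mentioning any intent keyword."""
    keywords = intent.get("keywords", [])  # assumed intent field
    if not documents or not keywords:
        return {"overallSuccess": False, "qualityScore": 0.0}
    hits = sum(1 for doc in documents if any(k in doc for k in keywords))
    score = hits / len(documents)
    return {"overallSuccess": score >= 0.5, "qualityScore": score}

print(validate_content(["Q3 revenue summary"], {"keywords": ["revenue"]}))
# {'overallSuccess': True, 'qualityScore': 1.0}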
@@ -204,6 +205,11 @@ class ReactMode(BaseMode):
selection = json.loads(response[jsonStart:jsonEnd]) selection = json.loads(response[jsonStart:jsonEnd])
if 'action' not in selection or not isinstance(selection['action'], str): if 'action' not in selection or not isinstance(selection['action'], str):
raise ValueError("Selection missing 'action' as string") raise ValueError("Selection missing 'action' as string")
# Validate document references - prevent AI from inventing Message IDs
if 'requiredInputDocuments' in selection:
self._validateDocumentReferences(selection['requiredInputDocuments'], context)
# Enforce spec: Stage 1 must NOT include 'parameters' # Enforce spec: Stage 1 must NOT include 'parameters'
if 'parameters' in selection: if 'parameters' in selection:
# Remove to avoid accidental carryover # Remove to avoid accidental carryover
@@ -213,6 +219,38 @@ class ReactMode(BaseMode):
selection['parameters'] = None selection['parameters'] = None
return selection return selection
def _validateDocumentReferences(self, document_refs: List[str], context: TaskContext) -> None:
    """Validate that document references exist in the current workflow."""
    if not document_refs:
        return

    # Get available documents from the current workflow
    try:
        available_docs = self.services.workflow.getAvailableDocuments(context.workflow)
        if not available_docs or available_docs == "No documents available":
            logger.warning("No documents available for validation")
            return

        # Extract all valid references from the rendered index (requires `import re` at module top)
        valid_refs = []
        for line in available_docs.split('\n'):
            if 'docList:' in line or 'docItem:' in line:
                # Extract the reference from a line like "  - docList:msg_xxx:label"
                ref_match = re.search(r'(docList:[^\s]+|docItem:[^\s]+)', line)
                if ref_match:
                    valid_refs.append(ref_match.group(1))

        # Reject any provided reference that is not in the index
        for ref in document_refs:
            if ref not in valid_refs:
                logger.error(f"Invalid document reference: {ref}")
                logger.error(f"Available references: {valid_refs}")
                raise ValueError(f"Document reference '{ref}' not found in available documents. Use only exact references from AVAILABLE_DOCUMENTS_INDEX.")
    except ValueError:
        # Re-raise reference errors as-is; the broad handler below would
        # otherwise catch and double-wrap the ValueError raised just above
        raise
    except Exception as e:
        logger.error(f"Error validating document references: {str(e)}")
        raise ValueError(f"Failed to validate document references: {str(e)}") from e
async def _actExecute(self, context: TaskContext, selection: Dict[str, Any], taskStep: TaskStep, async def _actExecute(self, context: TaskContext, selection: Dict[str, Any], taskStep: TaskStep,
workflow: ChatWorkflow, stepIndex: int) -> ActionResult: workflow: ChatWorkflow, stepIndex: int) -> ActionResult:
"""Act: request minimal parameters then execute selected action""" """Act: request minimal parameters then execute selected action"""

View file

@@ -72,6 +72,9 @@ RULES:
3. parametersContext must be short and sufficient for Stage 2 3. parametersContext must be short and sufficient for Stage 2
4. Return ONLY JSON - no markdown, no explanations 4. Return ONLY JSON - no markdown, no explanations
5. For requiredInputDocuments, use ONLY exact references from AVAILABLE_DOCUMENTS_INDEX (docList:... or docItem:...) 5. For requiredInputDocuments, use ONLY exact references from AVAILABLE_DOCUMENTS_INDEX (docList:... or docItem:...)
- DO NOT invent or modify Message IDs
- DO NOT create new references
- Copy references EXACTLY as shown in AVAILABLE_DOCUMENTS_INDEX
6. For requiredConnection, use ONLY an exact label from AVAILABLE_CONNECTIONS_INDEX 6. For requiredConnection, use ONLY an exact label from AVAILABLE_CONNECTIONS_INDEX
""" """