feat: switch to single file approach

Christopher Gondek 2025-09-01 11:21:37 +02:00
parent 0816e7c45c
commit 4c3592d7d9
3 changed files with 261 additions and 56 deletions

View file

@@ -7,16 +7,19 @@ from modules.interfaces.interface_web_model import (
     WebCrawlBase,
     WebCrawlDocumentData,
     WebCrawlRequest,
+    WebCrawlResultItem,
     WebScrapeActionDocument,
     WebScrapeActionResult,
     WebScrapeBase,
     WebScrapeDocumentData,
     WebScrapeRequest,
+    WebScrapeResultItem,
     WebSearchBase,
     WebSearchRequest,
     WebSearchActionResult,
     WebSearchActionDocument,
     WebSearchDocumentData,
+    WebSearchResultItem,
     WebCrawlActionDocument,
     WebCrawlActionResult,
 )
@@ -62,7 +65,7 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase, WebScrapeBase):
         # Step 2: Build ActionResult
         try:
-            result = self._build_search_action_result(search_results)
+            result = self._build_search_action_result(search_results, request.query)
         except Exception as e:
             return WebSearchActionResult(success=False, error=str(e))
@@ -78,7 +81,7 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase, WebScrapeBase):
         # Step 2: Build ActionResult
         try:
-            result = self._build_crawl_action_result(crawl_results)
+            result = self._build_crawl_action_result(crawl_results, request.urls)
         except Exception as e:
             return WebCrawlActionResult(success=False, error=str(e))
@@ -101,7 +104,7 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase, WebScrapeBase):
         # Step 3: Build ActionResult
         try:
-            result = self._build_scrape_action_result(crawl_results)
+            result = self._build_scrape_action_result(crawl_results, request.query)
         except Exception as e:
             return WebScrapeActionResult(success=False, error=str(e))
@@ -124,26 +127,32 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase, WebScrapeBase):
         ]

     def _build_search_action_result(
-        self, search_results: list[TavilySearchResult]
+        self, search_results: list[TavilySearchResult], query: str = ""
     ) -> WebSearchActionResult:
         """Builds the ActionResult from the search results."""
-        documents = []
-        for result in search_results:
-            document_name = f"web_search_{get_utc_timestamp()}.txt"
-            document_data = WebSearchDocumentData(title=result.title, url=result.url)
-            mime_type = "application/json"
-            doc = WebSearchActionDocument(
-                documentName=document_name,
-                documentData=document_data,
-                mimeType=mime_type,
-            )
-            documents.append(doc)
-        return WebSearchActionResult(
-            success=True, documents=documents, resultLabel="web_search_results"
-        )
+        # Convert to result items
+        result_items = [
+            WebSearchResultItem(title=result.title, url=result.url)
+            for result in search_results
+        ]
+
+        # Create document data with all results
+        document_data = WebSearchDocumentData(
+            query=query, results=result_items, total_count=len(result_items)
+        )
+
+        # Create single document
+        document = WebSearchActionDocument(
+            documentName=f"web_search_results_{get_utc_timestamp()}.json",
+            documentData=document_data,
+            mimeType="application/json",
+        )
+
+        return WebSearchActionResult(
+            success=True, documents=[document], resultLabel="web_search_results"
+        )

-    async def _crawl(self, urls: list) -> list[str]:
+    async def _crawl(self, urls: list) -> list[TavilyCrawlResult]:
         """Calls the Tavily API to extract text content from URLs."""
         response = await self.client.extract(
             urls=urls, extract_depth="advanced", format="text"
@@ -154,43 +163,57 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase, WebScrapeBase):
         ]

     def _build_crawl_action_result(
-        self, crawl_results: list[TavilyCrawlResult]
+        self, crawl_results: list[TavilyCrawlResult], urls: list[str] = None
     ) -> WebCrawlActionResult:
         """Builds the ActionResult from the crawl results."""
-        documents = []
-        for result in crawl_results:
-            document_name = f"web_crawl_{get_utc_timestamp()}.txt"
-            doc_data = WebCrawlDocumentData(
-                url=result["url"], content=result["raw_content"]
-            )
-            mime_type = "application/json"
-            doc = WebCrawlActionDocument(
-                documentName=document_name,
-                documentData=doc_data,
-                mimeType=mime_type,
-            )
-            documents.append(doc)
+        # Convert to result items
+        result_items = [
+            WebCrawlResultItem(url=result.url, content=result.content)
+            for result in crawl_results
+        ]
+
+        # Create document data with all results
+        document_data = WebCrawlDocumentData(
+            urls=urls or [result.url for result in crawl_results],
+            results=result_items,
+            total_count=len(result_items),
+        )
+
+        # Create single document
+        document = WebCrawlActionDocument(
+            documentName=f"web_crawl_results_{get_utc_timestamp()}.json",
+            documentData=document_data,
+            mimeType="application/json",
+        )
+
         return WebCrawlActionResult(
-            success=True, documents=documents, resultLabel="web_crawl_results"
+            success=True, documents=[document], resultLabel="web_crawl_results"
         )

     def _build_scrape_action_result(
-        self, crawl_results: list[TavilyCrawlResult]
+        self, crawl_results: list[TavilyCrawlResult], query: str = ""
     ) -> WebScrapeActionResult:
         """Builds the ActionResult from the scrape results."""
-        documents = []
-        for result in crawl_results:
-            document_name = f"web_scrape_{get_utc_timestamp()}.txt"
-            doc_data = WebScrapeDocumentData(url=result.url, content=result.content)
-            mime_type = "application/json"
-            doc = WebScrapeActionDocument(
-                documentName=document_name,
-                documentData=doc_data,
-                mimeType=mime_type,
-            )
-            documents.append(doc)
+        # Convert to result items
+        result_items = [
+            WebScrapeResultItem(url=result.url, content=result.content)
+            for result in crawl_results
+        ]
+
+        # Create document data with all results
+        document_data = WebScrapeDocumentData(
+            query=query,
+            results=result_items,
+            total_count=len(result_items),
+        )
+
+        # Create single document
+        document = WebScrapeActionDocument(
+            documentName=f"web_scrape_results_{get_utc_timestamp()}.json",
+            documentData=document_data,
+            mimeType="application/json",
+        )
+
         return WebScrapeActionResult(
-            success=True, documents=documents, resultLabel="web_scrape_results"
+            success=True, documents=[document], resultLabel="web_scrape_results"
         )

View file

@@ -2,8 +2,6 @@
 from abc import ABC, abstractmethod
 from modules.interfaces.interfaceChatModel import ActionDocument, ActionResult
 from pydantic import BaseModel, Field
 from typing import List
@@ -18,11 +16,21 @@ class WebSearchRequest(BaseModel):
     max_results: int


-class WebSearchDocumentData(BaseModel):
+class WebSearchResultItem(BaseModel):
+    """Individual search result"""
+
     title: str
     url: str


+class WebSearchDocumentData(BaseModel):
+    """Complete search results document"""
+
+    query: str
+    results: List[WebSearchResultItem]
+    total_count: int
+
+
 class WebSearchActionDocument(ActionDocument):
     documentData: WebSearchDocumentData
@@ -45,14 +53,24 @@ class WebCrawlRequest(BaseModel):
     urls: List[str]


-class WebCrawlDocumentData(BaseModel):
+class WebCrawlResultItem(BaseModel):
+    """Individual crawl result"""
+
     url: str
     content: str


+class WebCrawlDocumentData(BaseModel):
+    """Complete crawl results document"""
+
+    urls: List[str]
+    results: List[WebCrawlResultItem]
+    total_count: int
+
+
 class WebCrawlActionDocument(ActionDocument):
     documentData: WebCrawlDocumentData = Field(
-        description="The data extracted from a single crawled URL"
+        description="The data extracted from crawled URLs"
     )
@@ -75,14 +93,24 @@ class WebScrapeRequest(BaseModel):
     max_results: int


-class WebScrapeDocumentData(BaseModel):
+class WebScrapeResultItem(BaseModel):
+    """Individual scrape result"""
+
     url: str
     content: str


+class WebScrapeDocumentData(BaseModel):
+    """Complete scrape results document"""
+
+    query: str
+    results: List[WebScrapeResultItem]
+    total_count: int
+
+
 class WebScrapeActionDocument(ActionDocument):
     documentData: WebScrapeDocumentData = Field(
-        description="The data extracted from a single scraped URL"
+        description="The data extracted from scraped URLs"
     )
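
Note: to make the "single file approach" concrete, the payload of the new single search document serializes roughly as follows. This is a minimal sketch with stand-in data and local copies of the two models above (assuming pydantic v2 for model_dump_json); the real models live in modules.interfaces.interface_web_model.

from typing import List

from pydantic import BaseModel


class WebSearchResultItem(BaseModel):
    title: str
    url: str


class WebSearchDocumentData(BaseModel):
    query: str
    results: List[WebSearchResultItem]
    total_count: int


# Stand-in results; the connector fills these from the Tavily response.
results = [
    WebSearchResultItem(title="Example Domain", url="https://example.com"),
    WebSearchResultItem(title="Example Org", url="https://example.org"),
]
document_data = WebSearchDocumentData(
    query="example domains", results=results, total_count=len(results)
)

# One JSON payload per search run, instead of one document per hit.
print(document_data.model_dump_json(indent=2))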

View file

@@ -3,7 +3,11 @@ from typing import Any, Dict
 from modules.chat.methodBase import MethodBase, action
 from modules.interfaces.interfaceChatModel import ActionResult
 from modules.interfaces.interface_web_objects import WebInterface
-from modules.interfaces.interface_web_model import WebSearchRequest
+from modules.interfaces.interface_web_model import (
+    WebSearchRequest,
+    WebCrawlRequest,
+    WebScrapeRequest,
+)

 logger = logging.getLogger(__name__)
@@ -14,11 +18,14 @@ class MethodWeb(MethodBase):

     def __init__(self, serviceCenter: Any):
         super().__init__(serviceCenter)
+        self.name = "web"
+        self.description = "Web search, crawling, and scraping operations using Tavily"

     @action
     async def search(self, parameters: Dict[str, Any]) -> ActionResult:
-        """
-        Perform a web search and output a .txt file with a plain list of URLs (one per line).
-        Each result contains "title" and "url".
+        """Perform a web search and outputs a .json file with a list of found URLs.

         Parameters:
             query (str): Search query to perform
@@ -41,3 +48,150 @@ class MethodWeb(MethodBase):

         except Exception as e:
             return ActionResult(success=False, error=str(e))
+
+    @action
+    async def crawl(self, parameters: Dict[str, Any]) -> ActionResult:
+        """Crawls a list of URLs and extracts information from them.
+
+        Parameters:
+            document (str): Document reference containing URL list from search results
+            expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
+        """
+        try:
+            document_ref = parameters.get("document")
+            if not document_ref:
+                return ActionResult(
+                    success=False, error="No document reference provided."
+                )
+
+            # Resolve document reference to ChatDocument objects
+            chat_documents = self.service.getChatDocumentsFromDocumentList(
+                [document_ref]
+            )
+            if not chat_documents:
+                return ActionResult(
+                    success=False,
+                    error=f"No documents found for reference: {document_ref}",
+                )
+
+            # Get the first document (search results)
+            search_doc = chat_documents[0]
+
+            # Get file data using the service center
+            file_data = self.service.getFileData(search_doc.fileId)
+            if not file_data:
+                return ActionResult(
+                    success=False, error="Could not retrieve file data for document"
+                )
+
+            content = file_data.decode("utf-8")
+
+            # Parse JSON to extract URLs from search results
+            import json
+
+            try:
+                # The document structure from WebSearchActionResult
+                search_data = json.loads(content)
+
+                # Extract URLs from the search results structure
+                urls = []
+                if isinstance(search_data, dict):
+                    # Handle the document structure: documentData contains the actual search results
+                    doc_data = search_data.get("documentData", search_data)
+
+                    if "results" in doc_data and isinstance(doc_data["results"], list):
+                        urls = [
+                            result["url"]
+                            for result in doc_data["results"]
+                            if isinstance(result, dict) and "url" in result
+                        ]
+                    elif "urls" in doc_data and isinstance(doc_data["urls"], list):
+                        # Fallback: if URLs are stored directly in a 'urls' field
+                        urls = [url for url in doc_data["urls"] if isinstance(url, str)]
+
+                # Fallback: try to parse as plain text with regex (for backward compatibility)
+                if not urls:
+                    logger.warning(
+                        "Could not extract URLs from JSON structure, trying plain text parsing"
+                    )
+                    import re
+
+                    urls = re.split(r"[\n,;]+", content)
+                    urls = [
+                        u.strip()
+                        for u in urls
+                        if u.strip()
+                        and (
+                            u.strip().startswith("http://")
+                            or u.strip().startswith("https://")
+                        )
+                    ]
+
+            except json.JSONDecodeError:
+                # Fallback to plain text parsing if JSON parsing fails
+                logger.warning("Document is not valid JSON, trying plain text parsing")
+                import re
+
+                urls = re.split(r"[\n,;]+", content)
+                urls = [
+                    u.strip()
+                    for u in urls
+                    if u.strip()
+                    and (
+                        u.strip().startswith("http://")
+                        or u.strip().startswith("https://")
+                    )
+                ]
+
+            if not urls:
+                return ActionResult(
+                    success=False, error="No valid URLs found in the document."
+                )
+
+            logger.info(f"Extracted {len(urls)} URLs from document: {urls}")
+
+            # Prepare request data
+            web_crawl_request = WebCrawlRequest(urls=urls)
+
+            # Perform request
+            web_interface = await WebInterface.create()
+            web_crawl_result = await web_interface.crawl(web_crawl_request)
+
+            return web_crawl_result
+
+        except Exception as e:
+            logger.error(f"Error in crawl method: {str(e)}")
+            return ActionResult(success=False, error=str(e))
+
+    @action
+    async def scrape(self, parameters: Dict[str, Any]) -> ActionResult:
+        """Scrapes web content by searching for URLs and then extracting their content.
+        Combines search and crawl operations in one step.
+
+        Parameters:
+            query (str): Search query to perform
+            maxResults (int, optional): Maximum number of results (default: 10)
+        """
+        try:
+            query = parameters.get("query")
+            max_results = parameters.get("maxResults", 10)
+
+            if not query:
+                return ActionResult(success=False, error="Search query is required")
+
+            # Prepare request data
+            web_scrape_request = WebScrapeRequest(
+                query=query,
+                max_results=max_results,
+            )
+
+            # Perform request
+            web_interface = await WebInterface.create()
+            web_scrape_result = await web_interface.scrape(web_scrape_request)
+
+            return web_scrape_result
+
+        except Exception as e:
+            return ActionResult(success=False, error=str(e))
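
Note: the URL extraction that the new crawl action performs on the single search-results document can be sketched standalone as below. The helper name extract_urls and the sample payload are illustrative only; the JSON-first parse with a plain-text fallback mirrors the logic added above.

import json
import re


def extract_urls(content: str) -> list[str]:
    """Pull URLs out of a search-results document, falling back to plain text."""
    try:
        search_data = json.loads(content)
    except json.JSONDecodeError:
        search_data = None

    urls: list[str] = []
    if isinstance(search_data, dict):
        # documentData carries the actual WebSearchDocumentData payload.
        doc_data = search_data.get("documentData", search_data)
        if isinstance(doc_data, dict):
            if isinstance(doc_data.get("results"), list):
                urls = [
                    item["url"]
                    for item in doc_data["results"]
                    if isinstance(item, dict) and "url" in item
                ]
            elif isinstance(doc_data.get("urls"), list):
                urls = [u for u in doc_data["urls"] if isinstance(u, str)]
    if urls:
        return urls

    # Fallback: treat the document as a plain-text list of URLs.
    return [
        u.strip()
        for u in re.split(r"[\n,;]+", content)
        if u.strip().startswith(("http://", "https://"))
    ]


print(extract_urls('{"documentData": {"results": [{"url": "https://example.com"}]}}'))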