feat: switch to single file approach
This commit is contained in:
parent
0816e7c45c
commit
4c3592d7d9
3 changed files with 261 additions and 56 deletions
|
|
@ -7,16 +7,19 @@ from modules.interfaces.interface_web_model import (
|
||||||
WebCrawlBase,
|
WebCrawlBase,
|
||||||
WebCrawlDocumentData,
|
WebCrawlDocumentData,
|
||||||
WebCrawlRequest,
|
WebCrawlRequest,
|
||||||
|
WebCrawlResultItem,
|
||||||
WebScrapeActionDocument,
|
WebScrapeActionDocument,
|
||||||
WebScrapeActionResult,
|
WebScrapeActionResult,
|
||||||
WebScrapeBase,
|
WebScrapeBase,
|
||||||
WebScrapeDocumentData,
|
WebScrapeDocumentData,
|
||||||
WebScrapeRequest,
|
WebScrapeRequest,
|
||||||
|
WebScrapeResultItem,
|
||||||
WebSearchBase,
|
WebSearchBase,
|
||||||
WebSearchRequest,
|
WebSearchRequest,
|
||||||
WebSearchActionResult,
|
WebSearchActionResult,
|
||||||
WebSearchActionDocument,
|
WebSearchActionDocument,
|
||||||
WebSearchDocumentData,
|
WebSearchDocumentData,
|
||||||
|
WebSearchResultItem,
|
||||||
WebCrawlActionDocument,
|
WebCrawlActionDocument,
|
||||||
WebCrawlActionResult,
|
WebCrawlActionResult,
|
||||||
)
|
)
|
||||||
|
|
@ -62,7 +65,7 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase, WebScrapeBase):
|
||||||
|
|
||||||
# Step 2: Build ActionResult
|
# Step 2: Build ActionResult
|
||||||
try:
|
try:
|
||||||
result = self._build_search_action_result(search_results)
|
result = self._build_search_action_result(search_results, request.query)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return WebSearchActionResult(success=False, error=str(e))
|
return WebSearchActionResult(success=False, error=str(e))
|
||||||
|
|
||||||
|
|
@ -78,7 +81,7 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase, WebScrapeBase):
|
||||||
|
|
||||||
# Step 2: Build ActionResult
|
# Step 2: Build ActionResult
|
||||||
try:
|
try:
|
||||||
result = self._build_crawl_action_result(crawl_results)
|
result = self._build_crawl_action_result(crawl_results, request.urls)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return WebCrawlActionResult(success=False, error=str(e))
|
return WebCrawlActionResult(success=False, error=str(e))
|
||||||
|
|
||||||
|
|
@ -101,7 +104,7 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase, WebScrapeBase):
|
||||||
|
|
||||||
# Step 3: Build ActionResult
|
# Step 3: Build ActionResult
|
||||||
try:
|
try:
|
||||||
result = self._build_scrape_action_result(crawl_results)
|
result = self._build_scrape_action_result(crawl_results, request.query)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return WebScrapeActionResult(success=False, error=str(e))
|
return WebScrapeActionResult(success=False, error=str(e))
|
||||||
|
|
||||||
|
|
@ -124,26 +127,32 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase, WebScrapeBase):
|
||||||
]
|
]
|
||||||
|
|
||||||
def _build_search_action_result(
|
def _build_search_action_result(
|
||||||
self, search_results: list[TavilySearchResult]
|
self, search_results: list[TavilySearchResult], query: str = ""
|
||||||
) -> WebSearchActionResult:
|
) -> WebSearchActionResult:
|
||||||
"""Builds the ActionResult from the search results."""
|
"""Builds the ActionResult from the search results."""
|
||||||
documents = []
|
# Convert to result items
|
||||||
for result in search_results:
|
result_items = [
|
||||||
document_name = f"web_search_{get_utc_timestamp()}.txt"
|
WebSearchResultItem(title=result.title, url=result.url)
|
||||||
document_data = WebSearchDocumentData(title=result.title, url=result.url)
|
for result in search_results
|
||||||
mime_type = "application/json"
|
]
|
||||||
doc = WebSearchActionDocument(
|
|
||||||
documentName=document_name,
|
|
||||||
documentData=document_data,
|
|
||||||
mimeType=mime_type,
|
|
||||||
)
|
|
||||||
documents.append(doc)
|
|
||||||
|
|
||||||
return WebSearchActionResult(
|
# Create document data with all results
|
||||||
success=True, documents=documents, resultLabel="web_search_results"
|
document_data = WebSearchDocumentData(
|
||||||
|
query=query, results=result_items, total_count=len(result_items)
|
||||||
)
|
)
|
||||||
|
|
||||||
async def _crawl(self, urls: list) -> list[str]:
|
# Create single document
|
||||||
|
document = WebSearchActionDocument(
|
||||||
|
documentName=f"web_search_results_{get_utc_timestamp()}.json",
|
||||||
|
documentData=document_data,
|
||||||
|
mimeType="application/json",
|
||||||
|
)
|
||||||
|
|
||||||
|
return WebSearchActionResult(
|
||||||
|
success=True, documents=[document], resultLabel="web_search_results"
|
||||||
|
)
|
||||||
|
|
||||||
|
async def _crawl(self, urls: list) -> list[TavilyCrawlResult]:
|
||||||
"""Calls the Tavily API to extract text content from URLs."""
|
"""Calls the Tavily API to extract text content from URLs."""
|
||||||
response = await self.client.extract(
|
response = await self.client.extract(
|
||||||
urls=urls, extract_depth="advanced", format="text"
|
urls=urls, extract_depth="advanced", format="text"
|
||||||
|
|
@ -154,43 +163,57 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase, WebScrapeBase):
|
||||||
]
|
]
|
||||||
|
|
||||||
def _build_crawl_action_result(
|
def _build_crawl_action_result(
|
||||||
self, crawl_results: list[TavilyCrawlResult]
|
self, crawl_results: list[TavilyCrawlResult], urls: list[str] = None
|
||||||
) -> WebCrawlActionResult:
|
) -> WebCrawlActionResult:
|
||||||
"""Builds the ActionResult from the crawl results."""
|
"""Builds the ActionResult from the crawl results."""
|
||||||
documents = []
|
# Convert to result items
|
||||||
for result in crawl_results:
|
result_items = [
|
||||||
document_name = f"web_crawl_{get_utc_timestamp()}.txt"
|
WebCrawlResultItem(url=result.url, content=result.content)
|
||||||
doc_data = WebCrawlDocumentData(
|
for result in crawl_results
|
||||||
url=result["url"], content=result["raw_content"]
|
]
|
||||||
)
|
|
||||||
mime_type = "application/json"
|
# Create document data with all results
|
||||||
doc = WebCrawlActionDocument(
|
document_data = WebCrawlDocumentData(
|
||||||
documentName=document_name,
|
urls=urls or [result.url for result in crawl_results],
|
||||||
documentData=doc_data,
|
results=result_items,
|
||||||
mimeType=mime_type,
|
total_count=len(result_items),
|
||||||
)
|
)
|
||||||
documents.append(doc)
|
|
||||||
|
# Create single document
|
||||||
|
document = WebCrawlActionDocument(
|
||||||
|
documentName=f"web_crawl_results_{get_utc_timestamp()}.json",
|
||||||
|
documentData=document_data,
|
||||||
|
mimeType="application/json",
|
||||||
|
)
|
||||||
|
|
||||||
return WebCrawlActionResult(
|
return WebCrawlActionResult(
|
||||||
success=True, documents=documents, resultLabel="web_crawl_results"
|
success=True, documents=[document], resultLabel="web_crawl_results"
|
||||||
)
|
)
|
||||||
|
|
||||||
def _build_scrape_action_result(
|
def _build_scrape_action_result(
|
||||||
self, crawl_results: list[TavilyCrawlResult]
|
self, crawl_results: list[TavilyCrawlResult], query: str = ""
|
||||||
) -> WebScrapeActionResult:
|
) -> WebScrapeActionResult:
|
||||||
"""Builds the ActionResult from the scrape results."""
|
"""Builds the ActionResult from the scrape results."""
|
||||||
documents = []
|
# Convert to result items
|
||||||
for result in crawl_results:
|
result_items = [
|
||||||
document_name = f"web_scrape_{get_utc_timestamp()}.txt"
|
WebScrapeResultItem(url=result.url, content=result.content)
|
||||||
doc_data = WebScrapeDocumentData(url=result.url, content=result.content)
|
for result in crawl_results
|
||||||
mime_type = "application/json"
|
]
|
||||||
doc = WebScrapeActionDocument(
|
|
||||||
documentName=document_name,
|
# Create document data with all results
|
||||||
documentData=doc_data,
|
document_data = WebScrapeDocumentData(
|
||||||
mimeType=mime_type,
|
query=query,
|
||||||
)
|
results=result_items,
|
||||||
documents.append(doc)
|
total_count=len(result_items),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create single document
|
||||||
|
document = WebScrapeActionDocument(
|
||||||
|
documentName=f"web_scrape_results_{get_utc_timestamp()}.json",
|
||||||
|
documentData=document_data,
|
||||||
|
mimeType="application/json",
|
||||||
|
)
|
||||||
|
|
||||||
return WebScrapeActionResult(
|
return WebScrapeActionResult(
|
||||||
success=True, documents=documents, resultLabel="web_scrape_results"
|
success=True, documents=[document], resultLabel="web_scrape_results"
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -2,8 +2,6 @@
|
||||||
|
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from modules.interfaces.interfaceChatModel import ActionDocument, ActionResult
|
from modules.interfaces.interfaceChatModel import ActionDocument, ActionResult
|
||||||
|
|
||||||
|
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
|
@ -18,11 +16,21 @@ class WebSearchRequest(BaseModel):
|
||||||
max_results: int
|
max_results: int
|
||||||
|
|
||||||
|
|
||||||
class WebSearchDocumentData(BaseModel):
|
class WebSearchResultItem(BaseModel):
|
||||||
|
"""Individual search result"""
|
||||||
|
|
||||||
title: str
|
title: str
|
||||||
url: str
|
url: str
|
||||||
|
|
||||||
|
|
||||||
|
class WebSearchDocumentData(BaseModel):
|
||||||
|
"""Complete search results document"""
|
||||||
|
|
||||||
|
query: str
|
||||||
|
results: List[WebSearchResultItem]
|
||||||
|
total_count: int
|
||||||
|
|
||||||
|
|
||||||
class WebSearchActionDocument(ActionDocument):
|
class WebSearchActionDocument(ActionDocument):
|
||||||
documentData: WebSearchDocumentData
|
documentData: WebSearchDocumentData
|
||||||
|
|
||||||
|
|
@ -45,14 +53,24 @@ class WebCrawlRequest(BaseModel):
|
||||||
urls: List[str]
|
urls: List[str]
|
||||||
|
|
||||||
|
|
||||||
class WebCrawlDocumentData(BaseModel):
|
class WebCrawlResultItem(BaseModel):
|
||||||
|
"""Individual crawl result"""
|
||||||
|
|
||||||
url: str
|
url: str
|
||||||
content: str
|
content: str
|
||||||
|
|
||||||
|
|
||||||
|
class WebCrawlDocumentData(BaseModel):
|
||||||
|
"""Complete crawl results document"""
|
||||||
|
|
||||||
|
urls: List[str]
|
||||||
|
results: List[WebCrawlResultItem]
|
||||||
|
total_count: int
|
||||||
|
|
||||||
|
|
||||||
class WebCrawlActionDocument(ActionDocument):
|
class WebCrawlActionDocument(ActionDocument):
|
||||||
documentData: WebCrawlDocumentData = Field(
|
documentData: WebCrawlDocumentData = Field(
|
||||||
description="The data extracted from a single crawled URL"
|
description="The data extracted from crawled URLs"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -75,14 +93,24 @@ class WebScrapeRequest(BaseModel):
|
||||||
max_results: int
|
max_results: int
|
||||||
|
|
||||||
|
|
||||||
class WebScrapeDocumentData(BaseModel):
|
class WebScrapeResultItem(BaseModel):
|
||||||
|
"""Individual scrape result"""
|
||||||
|
|
||||||
url: str
|
url: str
|
||||||
content: str
|
content: str
|
||||||
|
|
||||||
|
|
||||||
|
class WebScrapeDocumentData(BaseModel):
|
||||||
|
"""Complete scrape results document"""
|
||||||
|
|
||||||
|
query: str
|
||||||
|
results: List[WebScrapeResultItem]
|
||||||
|
total_count: int
|
||||||
|
|
||||||
|
|
||||||
class WebScrapeActionDocument(ActionDocument):
|
class WebScrapeActionDocument(ActionDocument):
|
||||||
documentData: WebScrapeDocumentData = Field(
|
documentData: WebScrapeDocumentData = Field(
|
||||||
description="The data extracted from a single scraped URL"
|
description="The data extracted from scraped URLs"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,11 @@ from typing import Any, Dict
|
||||||
from modules.chat.methodBase import MethodBase, action
|
from modules.chat.methodBase import MethodBase, action
|
||||||
from modules.interfaces.interfaceChatModel import ActionResult
|
from modules.interfaces.interfaceChatModel import ActionResult
|
||||||
from modules.interfaces.interface_web_objects import WebInterface
|
from modules.interfaces.interface_web_objects import WebInterface
|
||||||
from modules.interfaces.interface_web_model import WebSearchRequest
|
from modules.interfaces.interface_web_model import (
|
||||||
|
WebSearchRequest,
|
||||||
|
WebCrawlRequest,
|
||||||
|
WebScrapeRequest,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
@ -14,11 +18,14 @@ class MethodWeb(MethodBase):
|
||||||
|
|
||||||
def __init__(self, serviceCenter: Any):
|
def __init__(self, serviceCenter: Any):
|
||||||
super().__init__(serviceCenter)
|
super().__init__(serviceCenter)
|
||||||
|
self.name = "web"
|
||||||
|
self.description = "Web search, crawling, and scraping operations using Tavily"
|
||||||
|
|
||||||
@action
|
@action
|
||||||
async def search(self, parameters: Dict[str, Any]) -> ActionResult:
|
async def search(self, parameters: Dict[str, Any]) -> ActionResult:
|
||||||
"""
|
"""Perform a web search and output a .json file with a list of found URLs.
|
||||||
Perform a web search and output a .txt file with a plain list of URLs (one per line).
|
|
||||||
|
Each result contains "title" and "url".
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
query (str): Search query to perform
|
query (str): Search query to perform
|
||||||
|
|
@ -41,3 +48,150 @@ class MethodWeb(MethodBase):
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return ActionResult(success=False, error=str(e))
|
return ActionResult(success=False, error=str(e))
|
||||||
|
|
||||||
|
@action
async def crawl(self, parameters: Dict[str, Any]) -> ActionResult:
    """Crawls a list of URLs and extracts information from them.

    The URLs are read from the referenced document. Its content is first
    parsed as JSON; URLs are taken from ``results[*]["url"]`` or, failing
    that, from a ``urls`` list. Both are looked up under ``documentData``
    when that key is present, otherwise at the top level. When the content
    is not valid JSON, or no URL could be extracted from it, the raw text
    is split on newlines, commas and semicolons and every token starting
    with ``http://`` or ``https://`` is used instead.

    Parameters:
        document (str): Document reference containing URL list from search results
        expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description

    Returns:
        The result of ``WebInterface.crawl`` on success; otherwise an
        ``ActionResult`` with ``success=False`` and an error message. Any
        exception raised along the way is logged and reported this way
        rather than propagated.
    """
    # Function-scope imports, as in the original implementation.
    import json
    import re

    try:
        document_ref = parameters.get("document")
        if not document_ref:
            return ActionResult(
                success=False, error="No document reference provided."
            )

        # Resolve document reference to ChatDocument objects
        chat_documents = self.service.getChatDocumentsFromDocumentList(
            [document_ref]
        )
        if not chat_documents:
            return ActionResult(
                success=False,
                error=f"No documents found for reference: {document_ref}",
            )

        # Only the first resolved document is used as the URL source.
        search_doc = chat_documents[0]

        file_data = self.service.getFileData(search_doc.fileId)
        if not file_data:
            return ActionResult(
                success=False, error="Could not retrieve file data for document"
            )

        # NOTE(review): assumes getFileData returns bytes - confirm.
        content = file_data.decode("utf-8")

        urls = []
        try:
            search_data = json.loads(content)
        except json.JSONDecodeError:
            logger.warning("Document is not valid JSON, trying plain text parsing")
        else:
            if isinstance(search_data, dict):
                # The payload may be wrapped in "documentData" or be the
                # document data itself.
                doc_data = search_data.get("documentData", search_data)
                # Guard against a null / scalar "documentData": without it
                # the membership tests below raise or do substring matching.
                if isinstance(doc_data, dict):
                    results = doc_data.get("results")
                    direct_urls = doc_data.get("urls")
                    if isinstance(results, list):
                        urls = [
                            item["url"]
                            for item in results
                            if isinstance(item, dict) and "url" in item
                        ]
                    elif isinstance(direct_urls, list):
                        # Fallback: URLs stored directly in a 'urls' field
                        urls = [u for u in direct_urls if isinstance(u, str)]

            if not urls:
                logger.warning(
                    "Could not extract URLs from JSON structure, trying plain text parsing"
                )

        # Single plain-text fallback shared by both failure modes above
        # (kept for backward compatibility with plain URL lists).
        if not urls:
            candidates = (part.strip() for part in re.split(r"[\n,;]+", content))
            urls = [
                candidate
                for candidate in candidates
                if candidate.startswith(("http://", "https://"))
            ]

        if not urls:
            return ActionResult(
                success=False, error="No valid URLs found in the document."
            )

        logger.info("Extracted %d URLs from document: %s", len(urls), urls)

        # Prepare and perform the crawl request
        web_crawl_request = WebCrawlRequest(urls=urls)
        web_interface = await WebInterface.create()
        return await web_interface.crawl(web_crawl_request)

    except Exception as e:
        # Boundary handler: keep the traceback in the log, report failure
        # to the caller instead of raising.
        logger.exception("Error in crawl method: %s", e)
        return ActionResult(success=False, error=str(e))
|
||||||
|
|
||||||
|
@action
async def scrape(self, parameters: Dict[str, Any]) -> ActionResult:
    """Scrapes web content by searching for URLs and then extracting their content.

    Combines search and crawl operations in one step.

    Parameters:
        query (str): Search query to perform
        maxResults (int, optional): Maximum number of results (default: 10)

    Returns:
        Whatever ``WebInterface.scrape`` returns for the request, or an
        ``ActionResult`` with ``success=False`` when the query is missing
        or any exception is raised.
    """
    try:
        search_query = parameters.get("query")
        result_limit = parameters.get("maxResults", 10)

        # Guard clause: nothing to do without a query.
        if not search_query:
            return ActionResult(success=False, error="Search query is required")

        scrape_request = WebScrapeRequest(
            query=search_query, max_results=result_limit
        )

        # Delegate the actual work to the web interface.
        interface = await WebInterface.create()
        return await interface.scrape(scrape_request)

    except Exception as exc:
        return ActionResult(success=False, error=str(exc))
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue