diff --git a/modules/connectors/providerInfomaniak/connectorInfomaniak.py b/modules/connectors/providerInfomaniak/connectorInfomaniak.py index cb592261..f8512893 100644 --- a/modules/connectors/providerInfomaniak/connectorInfomaniak.py +++ b/modules/connectors/providerInfomaniak/connectorInfomaniak.py @@ -234,6 +234,21 @@ async def listAccessibleDrives(token: str) -> List[Dict[str, Any]]: return [d for d in drives if isinstance(d, dict) and d.get("id")] +def _lastNumericSegment(segments: List[str]) -> Optional[str]: + """Return the last all-digit segment (kDrive file/folder IDs are int). + + The agent sometimes appends the human-readable filename to a path, + e.g. ``/2980592/12/platform-overview.html``. The kDrive API does + not accept names -- only numeric IDs -- so we strip trailing + non-numeric segments and pick the last integer ID. + Returns ``None`` if no numeric segment exists. + """ + for seg in reversed(segments): + if seg.isdigit(): + return seg + return None + + class KdriveAdapter(ServiceAdapter): """kDrive ServiceAdapter -- browse drives, folders, files. @@ -246,6 +261,14 @@ class KdriveAdapter(ServiceAdapter): The drive list is cached on the adapter instance so each browse pays for one ``/2/drive/init`` call at most. + + File-vs-folder handling: a DataSource may point at a single file + (e.g. ``/{driveId}/{fileId}`` where ``fileId`` is a regular file). + Calling ``/files/{fileId}/files`` on a file answers + ``400 destination_not_a_directory`` -- so :meth:`browse` first + fetches the item's metadata and, if ``type=file``, returns a + one-element list describing the file itself instead of pretending + the directory is empty. 
""" def __init__(self, accessToken: str): @@ -258,6 +281,14 @@ class KdriveAdapter(ServiceAdapter): self._drives = await listAccessibleDrives(self._token) return self._drives + async def _fetchItemMeta(self, driveId: str, fileId: str) -> Optional[Dict[str, Any]]: + """Return the kDrive file/folder metadata dict, or ``None`` on error.""" + meta = await _infomaniakGet(self._token, f"/2/drive/{driveId}/files/{fileId}") + if not isinstance(meta, dict) or meta.get("error"): + return None + data = _unwrapData(meta) + return data if isinstance(data, dict) else None + async def browse( self, path: str, @@ -274,7 +305,21 @@ class KdriveAdapter(ServiceAdapter): if len(segments) == 1: return await self._listChildren(driveId, fileId=None, limit=limit) - fileId = segments[-1] + fileId = _lastNumericSegment(segments[1:]) + if fileId is None: + return [] + + meta = await self._fetchItemMeta(driveId, fileId) + if meta is not None and meta.get("type") == "file": + return [ExternalEntry( + name=meta.get("name") or fileId, + path=f"/{driveId}/{fileId}", + isFolder=False, + size=meta.get("size"), + mimeType=meta.get("mime_type"), + lastModified=meta.get("last_modified_at"), + metadata={"id": fileId, "kind": "file"}, + )] return await self._listChildren(driveId, fileId=fileId, limit=limit) async def _listDrives(self) -> List[ExternalEntry]: @@ -341,16 +386,16 @@ class KdriveAdapter(ServiceAdapter): segments = [s for s in (path or "").strip("/").split("/") if s] if len(segments) < 2: return DownloadResult() - driveId, fileId = segments[0], segments[-1] + driveId = segments[0] + # Agent may append the filename: ``/{driveId}/{fileId}/{name}``. + # Pull the last numeric segment instead of trusting segments[-1]. 
+ fileId = _lastNumericSegment(segments[1:]) + if fileId is None: + return DownloadResult() - meta = await _infomaniakGet(self._token, f"/2/drive/{driveId}/files/{fileId}") - fileName = fileId - mimeType = "application/octet-stream" - if isinstance(meta, dict) and not meta.get("error"): - data = _unwrapData(meta) - if isinstance(data, dict): - fileName = data.get("name") or fileId - mimeType = data.get("mime_type") or mimeType + meta = await self._fetchItemMeta(driveId, fileId) + fileName = (meta or {}).get("name") or fileId + mimeType = (meta or {}).get("mime_type") or "application/octet-stream" content = await _infomaniakDownload( self._token, f"/2/drive/{driveId}/files/{fileId}/download" diff --git a/modules/datamodels/datamodelDocref.py b/modules/datamodels/datamodelDocref.py index e4a43bd2..27ba5e2b 100644 --- a/modules/datamodels/datamodelDocref.py +++ b/modules/datamodels/datamodelDocref.py @@ -4,10 +4,13 @@ Document reference models for typed document references in workflows. """ -from typing import List, Optional +import logging +from typing import Any, List, Optional from pydantic import BaseModel, Field from modules.shared.i18nRegistry import i18nModel +logger = logging.getLogger(__name__) + class DocumentReference(BaseModel): """Base class for document references""" @@ -115,3 +118,86 @@ class DocumentReferenceList(BaseModel): references.append(DocumentListReference(label=refStr)) return cls(references=references) + + +def coerceDocumentReferenceList(value: Any) -> DocumentReferenceList: + """Tolerant coercion of any agent/UI-supplied document list to + :class:`DocumentReferenceList`. 
+ + Accepts the canonical formats plus the dict-wrapper shapes that + LLM tool-callers tend to generate when they see a + ``type=DocumentList`` parameter: + + * ``None`` / ``""`` -> empty list + * :class:`DocumentReferenceList` -> as-is + * ``str`` -> single-element string list + * ``list[str]`` -> :meth:`from_string_list` + * ``list[dict]`` with ``id`` or ``documentId`` -> item references + * ``{"documents": [...]}`` / ``{"references": [...]}`` -> + recurse into the inner list (this is the shape LLMs love) + * ``{"id": "..."}`` / ``{"documentId": "..."}`` -> single + item reference + * any unrecognised input -> empty list with a WARN log; never + raises (the caller decides whether an empty list is fatal). + """ + if value is None or value == "": + return DocumentReferenceList(references=[]) + if isinstance(value, DocumentReferenceList): + return value + if isinstance(value, str): + return DocumentReferenceList.from_string_list([value]) + + if isinstance(value, dict): + for innerKey in ("documents", "references", "items", "files"): + if innerKey in value and isinstance(value[innerKey], list): + return coerceDocumentReferenceList(value[innerKey]) + docId = value.get("documentId") or value.get("id") + if docId: + return DocumentReferenceList(references=[ + DocumentItemReference( + documentId=str(docId), + fileName=value.get("fileName") or value.get("name"), + ) + ]) + logger.warning( + f"coerceDocumentReferenceList: unsupported dict shape " + f"(keys={list(value.keys())}); returning empty list." 
+ ) + return DocumentReferenceList(references=[]) + + if isinstance(value, list): + if not value: + return DocumentReferenceList(references=[]) + first = value[0] + if isinstance(first, str): + return DocumentReferenceList.from_string_list(value) + if isinstance(first, dict): + references: List[DocumentReference] = [] + for item in value: + if not isinstance(item, dict): + continue + docId = item.get("documentId") or item.get("id") + if docId: + references.append(DocumentItemReference( + documentId=str(docId), + fileName=item.get("fileName") or item.get("name"), + )) + elif item.get("label"): + references.append(DocumentListReference( + label=str(item["label"]), + messageId=item.get("messageId"), + )) + return DocumentReferenceList(references=references) + # Mixed/object list (e.g. inline ActionDocument-like): caller + # must pre-handle that case before calling this coercer. + logger.warning( + f"coerceDocumentReferenceList: list element type " + f"{type(first).__name__} not recognised; returning empty list." + ) + return DocumentReferenceList(references=[]) + + logger.warning( + f"coerceDocumentReferenceList: unsupported value type " + f"{type(value).__name__}; returning empty list." + ) + return DocumentReferenceList(references=[]) diff --git a/modules/interfaces/interfaceDbManagement.py b/modules/interfaces/interfaceDbManagement.py index e6cee0b8..f72597b3 100644 --- a/modules/interfaces/interfaceDbManagement.py +++ b/modules/interfaces/interfaceDbManagement.py @@ -836,13 +836,25 @@ class ComponentObjects: def checkForDuplicateFile(self, fileHash: str, fileName: str) -> Optional[FileItem]: """Checks if a file with the same hash AND fileName already exists for the current user **within the same scope** (mandateId + featureInstanceId). - - Duplicate = same user + same fileHash + same fileName + same scope. + + Duplicate = same user + same fileHash + same fileName + same scope + RBAC-visible. Same hash with different name is allowed (intentional copy by user). 
+ + RBAC parity contract: this method must NEVER return a FileItem that + ``getFile()`` would not return for the current user. Otherwise callers + (``saveUploadedFile`` / ``createFile``) hand back an id that the very + next ``updateFile`` / ``getFile`` then rejects with + ``File with ID ... not found`` -- the well-known "ghost duplicate" + symptom seen when ``interfaceDbComponent`` is initialised without a + ``featureInstanceId`` (e.g. via ``serviceHub``) but a same-hash+name + file exists in another featureInstance under the same mandate. + We therefore cross-check the candidate through the RBAC-aware ``getFile`` + before returning it; if RBAC blocks it, we treat it as "no duplicate + for this scope" and the caller will create a fresh per-scope copy. + """ if not self.userId: return None - + recordFilter: dict = { "sysCreatedBy": self.userId, "fileHash": fileHash, @@ -857,10 +869,10 @@ class ComponentObjects: FileItem, recordFilter=recordFilter, ) - + if not matchingFiles: return None - + file = matchingFiles[0] fileId = file["id"] @@ -869,16 +881,17 @@ class ComponentObjects: logger.warning(f"Duplicate FileItem {fileId} found but FileData missing — treating as new file") return None - return FileItem( - id=fileId, - mandateId=file.get("mandateId", ""), - featureInstanceId=file.get("featureInstanceId", ""), - fileName=file["fileName"], - mimeType=file["mimeType"], - fileHash=file["fileHash"], - fileSize=file["fileSize"], - sysCreatedAt=file.get("sysCreatedAt"), - ) + rbacVisible = self.getFile(fileId) + if rbacVisible is None: + logger.info( + f"Duplicate FileItem {fileId} ('{fileName}', hash {fileHash[:12]}...) found via " + f"sysCreatedBy+hash+name match but is not RBAC-visible in current scope " + f"(mandateId={self.mandateId or '-'}, featureInstanceId={self.featureInstanceId or '-'}). " + f"Treating as no-duplicate so a fresh per-scope copy gets created." 
+ ) + return None + + return rbacVisible # Class-level cache — built once from the ExtractorRegistry _extensionToMime: Optional[Dict[str, str]] = None diff --git a/modules/serviceCenter/services/serviceAgent/conversationManager.py b/modules/serviceCenter/services/serviceAgent/conversationManager.py index 57e169be..055eae25 100644 --- a/modules/serviceCenter/services/serviceAgent/conversationManager.py +++ b/modules/serviceCenter/services/serviceAgent/conversationManager.py @@ -392,6 +392,18 @@ def buildSystemPrompt( "- Prefer modular file structures over monolithic files.\n" "- When generating applications, create separate files for logical components.\n" "- Always plan the structure before writing code.\n\n" + "### Document references for AI tools (CRITICAL)\n" + "Tools that produce a file (`downloadFromDataSource`, `writeFile mode=create`, " + "`renderDocument`, `generateImage`, `createChart`) return a result line with TWO ids:\n" + "- `documentList ref: docItem:` — pass this STRING VERBATIM as an entry of " + " `documentList` for `ai_process`, `ai_summarizeDocument`, `context_extractContent`, " + " `context_neutralizeData`, etc. 
Always as the literal `docItem:` — do NOT wrap " + " in `{\"documents\":[{\"id\":...}]}` and do NOT use the file id here, the documentList " + " resolver only matches `docItem:` references.\n" + "- `file id: ` — use for `readFile`, `searchInFileContent`, `writeFile mode=append`, " + " and image embeds (`![alt](file:)`).\n" + "Example: after `downloadFromDataSource` returns `docItem:abc123`, call " + "`ai_summarizeDocument(documentList=[\"docItem:abc123\"], summaryLength=\"medium\")`.\n\n" ) if toolsFormatted: diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_dataSourceTools.py b/modules/serviceCenter/services/serviceAgent/coreTools/_dataSourceTools.py index abe24c95..96ee31bb 100644 --- a/modules/serviceCenter/services/serviceAgent/coreTools/_dataSourceTools.py +++ b/modules/serviceCenter/services/serviceAgent/coreTools/_dataSourceTools.py @@ -9,7 +9,9 @@ from modules.serviceCenter.services.serviceAgent.datamodelAgent import ToolResul from modules.serviceCenter.services.serviceAgent.toolRegistry import ToolRegistry from modules.serviceCenter.services.serviceAgent.coreTools._helpers import ( + _attachFileAsChatDocument, _buildResolverDbFromServices, + _formatToolFileResult, _getOrCreateTempFolder, _looksLikeBinary, _resolveFileScope, @@ -231,11 +233,27 @@ def _registerDataSourceTools(registry: ToolRegistry, services): tempFolderId = _getOrCreateTempFolder(chatService) if tempFolderId: chatService.interfaceDbComponent.updateFile(fileItem.id, {"folderId": tempFolderId}) + + chatDocId = _attachFileAsChatDocument( + services, fileItem, + label=f"datasource:{dsId or directService or 'download'}", + userMessage=f"Downloaded {fileName} from external data source", + ) + ext = fileName.rsplit(".", 1)[-1].lower() if "." in fileName else "" - hint = "Use readFile to read the text content." if ext in ("doc", "docx", "txt", "csv", "json", "xml", "html", "md", "rtf", "odt", "xls", "xlsx", "pptx", "pdf", "eml", "msg") else "Use readFile to access the content." 
+ hint = ( + "Use readFile to read the text content." + if ext in ("doc", "docx", "txt", "csv", "json", "xml", "html", "md", "rtf", "odt", "xls", "xlsx", "pptx", "pdf", "eml", "msg") + else "Use readFile to access the content." + ) return ToolResult( toolCallId="", toolName="downloadFromDataSource", success=True, - data=f"Downloaded '{fileName}' ({len(fileBytes)} bytes) → local file id: {fileItem.id}. {hint}" + data=_formatToolFileResult( + fileItem=fileItem, + chatDocId=chatDocId, + actionLabel="Downloaded", + extraInfo=hint, + ), ) except Exception as e: return ToolResult(toolCallId="", toolName="downloadFromDataSource", success=False, error=str(e)) @@ -308,8 +326,15 @@ def _registerDataSourceTools(registry: ToolRegistry, services): registry.register( "downloadFromDataSource", _downloadFromDataSource, description=( - "Download a file or email from a data source into local storage. Returns a local file ID " - "to read with readFile. Accepts either dataSourceId OR connectionId+service. " + "Download a file or email from a data source into local storage. " + "The result line contains TWO ids you must use for different purposes:\n" + " - `documentList ref: docItem:` -- pass this string verbatim " + " inside the `documentList` parameter of `ai_process`, " + " `ai_summarizeDocument`, `context_extractContent`, `context_neutralizeData`, etc. " + " Always use the `docItem:` form, NOT the file id, NOT a `{\"documents\":[{\"id\":...}]}` " + " wrapper -- the documentList resolver only matches `docItem:` references against the workflow.\n" + " - `file id: ` -- pass this to `readFile`, `searchInFileContent`, image embeds (`file:`).\n" + "Accepts either dataSourceId OR connectionId+service. " "For email sources (Outlook, Gmail), browse/search only return subjects -- use this to get full content." 
), parameters={ diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_helpers.py b/modules/serviceCenter/services/serviceAgent/coreTools/_helpers.py index 6919ca18..129de517 100644 --- a/modules/serviceCenter/services/serviceAgent/coreTools/_helpers.py +++ b/modules/serviceCenter/services/serviceAgent/coreTools/_helpers.py @@ -3,7 +3,8 @@ """Shared helpers for core agent tools (file scope, binary detection, temp folder).""" import logging -from typing import Any, Optional +import uuid +from typing import Any, Dict, Optional, Tuple logger = logging.getLogger(__name__) @@ -78,6 +79,138 @@ def _getOrCreateTempFolder(chatService) -> Optional[str]: return None +def _attachFileAsChatDocument( + services: Any, + fileItem: Any, + *, + label: str = "agent_tool_output", + userMessage: str = "", + role: str = "assistant", +) -> Optional[str]: + """Bind a persisted FileItem to the active workflow as a ChatDocument. + + This is the **single canonical bridge** between agent-tool-produced + artefacts and the workflow's document model. Mirrors the pattern + used by workflow actions (``workflowProcessor.persistTaskResult`` / + ``methodTrustee.extractFromFiles``): every artefact a workflow step + -- including agent tools -- materialises ends up addressable via + ``docItem:`` so downstream tools that consume + ``documentList`` can resolve it against + ``workflow.messages[*].documents[*].id``. + + Without this bind the agent's ``downloadFromDataSource`` / + ``writeFile(create)`` / ``renderDocument`` / ``generateImage`` / + ``createChart`` outputs are FileItem-only and unreachable from + ``getChatDocumentsFromDocumentList`` -- the symptom is + ``ai_summarizeDocument`` etc. running with 0 ContentParts. + + Args: + services: agent-tool services container (must expose ``.chat``). + fileItem: persisted FileItem (Pydantic obj or dict) returned + from ``saveUploadedFile`` / ``createFile`` / + ``saveGeneratedFile``. 
+ label: ``documentsLabel`` for the carrier ChatMessage -- + picked up by ``docList: