gateway/modules/datamodels/datamodelDocref.py
2026-04-30 23:54:45 +02:00

205 lines
7.7 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Document reference models for typed document references in workflows.
"""
import logging
from typing import Any, List, Optional
from pydantic import BaseModel, Field
from modules.shared.i18nRegistry import i18nModel
logger = logging.getLogger(__name__)
class DocumentReference(BaseModel):
    """Base class for typed document references.

    Concrete subclasses hold the reference fields and override
    :meth:`to_string`; ``DocumentReferenceList.to_string_list`` calls that
    method on every element it holds, so the contract is declared here.
    """

    def to_string(self) -> str:
        """Return the string form of this reference.

        Raises:
            NotImplementedError: always on the base class; subclasses must
                override this method.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement to_string()"
        )
@i18nModel("Dokumentlisten-Referenz")
class DocumentListReference(DocumentReference):
    """Reference that addresses a whole document list by its message label."""

    # Optional; when set it is emitted between the prefix and the label.
    messageId: Optional[str] = Field(
        default=None,
        json_schema_extra={"label": "Nachrichten-ID"},
        description="Optional message ID for cross-round references",
    )
    # Required; always the last segment of the string form.
    label: str = Field(
        json_schema_extra={"label": "Bezeichnung"},
        description="Document list label",
    )

    def to_string(self) -> str:
        """Render as ``docList:messageId:label``, or ``docList:label`` when
        no message ID is set (``None`` or empty)."""
        prefix = f"docList:{self.messageId}:" if self.messageId else "docList:"
        return f"{prefix}{self.label}"
@i18nModel("Dokumentelement-Referenz")
class DocumentItemReference(DocumentReference):
    """Reference that addresses one specific document."""

    # Required; always the first segment after the prefix.
    documentId: str = Field(
        json_schema_extra={"label": "Dokument-ID"},
        description="Document ID",
    )
    # Optional; appended as a trailing segment only when non-empty.
    fileName: Optional[str] = Field(
        default=None,
        json_schema_extra={"label": "Dateiname"},
        description="Optional file name",
    )

    def to_string(self) -> str:
        """Render as ``docItem:documentId:fileName``, or ``docItem:documentId``
        when no file name is set (``None`` or empty)."""
        base = f"docItem:{self.documentId}"
        if not self.fileName:
            return base
        return f"{base}:{self.fileName}"
@i18nModel("Dokumentreferenz-Liste")
class DocumentReferenceList(BaseModel):
    """Ordered list of document references with string conversion helpers."""

    # NOTE(review): the element type is the base class. With pydantic v2,
    # serialising this model presumably uses the base-class schema and would
    # omit subclass fields (messageId/label/documentId/fileName) - confirm
    # before relying on model_dump(); to_string_list() is unaffected.
    references: List[DocumentReference] = Field(
        default_factory=list,
        description="List of document references",
        json_schema_extra={"label": "Referenzen"},
    )

    def to_string_list(self) -> List[str]:
        """Return the string form of every reference, in order.

        Each element must provide ``to_string()``; the concrete reference
        subclasses in this module do.
        """
        return [ref.to_string() for ref in self.references]

    @classmethod
    def from_string_list(cls, stringList: List[str]) -> "DocumentReferenceList":
        """Parse a list of reference strings into typed references.

        Supported formats:
        - ``docList:label``
        - ``docList:messageId:label``
        - ``docItem:documentId``
        - ``docItem:documentId:fileName``
        - a bare UUID (treated as a document ID)
        - any other non-empty string (treated as a document list label)

        Only the first ``:`` after the prefix separates the two parts, so the
        second part (label / file name) may itself contain colons.

        Entries that are not strings, are blank, or lack the required part
        (label for ``docList:``, document ID for ``docItem:``) are skipped.
        Empty optional parts (message ID, file name) become ``None``.

        Args:
            stringList: Reference strings; ``None`` is treated as empty.

        Returns:
            A new list holding one reference per valid entry, in input order.
        """
        # Function-scope import keeps the block self-contained; the pattern is
        # compiled once instead of once per loop iteration.
        import re

        uuidPattern = re.compile(
            r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$',
            re.I,
        )
        listPrefix = "docList:"
        itemPrefix = "docItem:"

        references: List[DocumentReference] = []
        for rawRef in stringList or []:
            # Type check first so non-string entries are never truth-tested.
            if not isinstance(rawRef, str):
                continue
            refStr = rawRef.strip()
            if not refStr:
                continue

            if refStr.startswith(listPrefix):
                head, separator, tail = refStr[len(listPrefix):].partition(":")
                # With a separator the first part is the message ID,
                # otherwise the single part is the label.
                messageId, label = (head, tail) if separator else ("", head)
                if label:
                    references.append(
                        DocumentListReference(messageId=messageId or None, label=label)
                    )
            elif refStr.startswith(itemPrefix):
                documentId, _, fileName = refStr[len(itemPrefix):].partition(":")
                # A reference without a document ID cannot address anything.
                if documentId:
                    references.append(
                        DocumentItemReference(documentId=documentId, fileName=fileName or None)
                    )
            elif uuidPattern.match(refStr):
                # Un-prefixed UUIDs are taken to be document IDs.
                references.append(DocumentItemReference(documentId=refStr))
            else:
                # Any other un-prefixed string is taken to be a list label.
                references.append(DocumentListReference(label=refStr))
        return cls(references=references)
def _referenceFromDict(item: dict) -> Optional[DocumentReference]:
    """Build one typed reference from a loosely shaped dict, or return None.

    ``documentId``/``id`` yields an item reference (with optional
    ``fileName``/``name``); otherwise ``label`` yields a list reference (with
    optional ``messageId``). Values are passed through ``str()`` so that
    non-string inputs do not fail model validation.
    """
    docId = item.get("documentId") or item.get("id")
    if docId:
        fileName = item.get("fileName") or item.get("name")
        return DocumentItemReference(
            documentId=str(docId),
            fileName=str(fileName) if fileName else None,
        )
    label = item.get("label")
    if label:
        messageId = item.get("messageId")
        return DocumentListReference(
            label=str(label),
            messageId=str(messageId) if messageId else None,
        )
    return None


def coerceDocumentReferenceList(value: Any) -> DocumentReferenceList:
    """Tolerant coercion of any agent/UI-supplied document list to
    :class:`DocumentReferenceList`.

    Accepts the canonical formats plus the dict-wrapper shapes that
    LLM tool-callers tend to generate when they see a
    ``type=DocumentList`` parameter:

    * ``None`` / ``""`` -> empty list
    * :class:`DocumentReferenceList` -> as-is
    * ``str`` -> single-element string list
    * ``list[str]`` -> :meth:`DocumentReferenceList.from_string_list`
    * ``list[dict]`` with ``id``/``documentId`` -> item references;
      with ``label`` (and optional ``messageId``) -> list references
    * ``list[DocumentReference]`` -> wrapped as-is
    * ``{"documents": [...]}`` / ``{"references": [...]}`` /
      ``{"items": [...]}`` / ``{"files": [...]}`` -> recurse into the
      inner list
    * ``{"id": "..."}`` / ``{"documentId": "..."}`` / ``{"label": "..."}``
      -> single reference
    * any unrecognised input -> empty list with a WARN log; never
      raises (the caller decides whether an empty list is fatal).

    A list is dispatched on the type of its FIRST element; elements of a
    different kind further down are skipped.

    Args:
        value: The raw value supplied by the agent or UI.

    Returns:
        The coerced reference list (possibly empty).
    """
    if value is None:
        return DocumentReferenceList(references=[])
    if isinstance(value, DocumentReferenceList):
        return value
    if isinstance(value, str):
        # Covers "" too: from_string_list skips blank entries. Dispatching on
        # type avoids evaluating ``value == ""`` on arbitrary objects, whose
        # __eq__ may not return a plain bool.
        return DocumentReferenceList.from_string_list([value])

    if isinstance(value, dict):
        # Wrapper shapes: unwrap the first key that holds a list.
        for innerKey in ("documents", "references", "items", "files"):
            inner = value.get(innerKey)
            if isinstance(inner, list):
                return coerceDocumentReferenceList(inner)
        reference = _referenceFromDict(value)
        if reference is not None:
            return DocumentReferenceList(references=[reference])
        logger.warning(
            "coerceDocumentReferenceList: unsupported dict shape "
            "(keys=%s); returning empty list.",
            list(value.keys()),
        )
        return DocumentReferenceList(references=[])

    if isinstance(value, list):
        if not value:
            return DocumentReferenceList(references=[])
        first = value[0]
        if isinstance(first, str):
            return DocumentReferenceList.from_string_list(value)
        if isinstance(first, DocumentReference):
            # Already typed: keep the typed elements, drop anything else.
            return DocumentReferenceList(
                references=[ref for ref in value if isinstance(ref, DocumentReference)]
            )
        if isinstance(first, dict):
            references: List[DocumentReference] = []
            skipped = 0
            for item in value:
                reference = _referenceFromDict(item) if isinstance(item, dict) else None
                if reference is None:
                    skipped += 1
                else:
                    references.append(reference)
            if skipped:
                # Surface dropped elements instead of losing them silently.
                logger.warning(
                    "coerceDocumentReferenceList: skipped %d list element(s) "
                    "without a usable id/documentId/label.",
                    skipped,
                )
            return DocumentReferenceList(references=references)
        # Mixed/object list (e.g. inline ActionDocument-like): caller
        # must pre-handle that case before calling this coercer.
        logger.warning(
            "coerceDocumentReferenceList: list element type "
            "%s not recognised; returning empty list.",
            type(first).__name__,
        )
        return DocumentReferenceList(references=[])

    logger.warning(
        "coerceDocumentReferenceList: unsupported value type "
        "%s; returning empty list.",
        type(value).__name__,
    )
    return DocumentReferenceList(references=[])