# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Knowledge Store data models.

Defines FileContentIndex, ContentChunk, RoundMemory and WorkflowMemory.

The models back a layered RAG architecture:
- Personal Layer: scope=personal, userId-scoped
- Instance Layer: scope=featureInstance, featureInstanceId-scoped
- Mandate Layer: scope=mandate, mandateId-scoped (visible to all mandate users)
- Global Layer: scope=global (sysAdmin only)
- Workflow Layer: workflowId-scoped (WorkflowMemory)

Vector fields use json_schema_extra={"db_type": "vector(1536)"} for pgvector.
"""

import uuid
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field

from modules.datamodels.datamodelBase import PowerOnModel
from modules.shared.i18nRegistry import i18nModel
from modules.shared.timeUtils import getUtcTimestamp

# Column type hint attached to every embedding field via json_schema_extra.
_VECTOR_DB_TYPE = "vector(1536)"


def _newId() -> str:
    """Return a freshly generated UUID4 rendered as a string."""
    return str(uuid.uuid4())


def _fkTarget(db: str, table: str, labelField: str) -> Dict[str, str]:
    """Build a new ``fk_target`` metadata dict for a foreign-key field."""
    return {"db": db, "table": table, "labelField": labelField}


@i18nModel("Datei-Inhaltsindex")
class FileContentIndex(PowerOnModel):
    """Structural index of the content objects of a file."""

    # --- Identity and scoping -------------------------------------------
    id: str = Field(
        default_factory=_newId,
        description="Primary key (typically = fileId)",
        json_schema_extra={"label": "ID"},
    )
    userId: str = Field(
        description="Owner user ID",
        json_schema_extra={
            "label": "Benutzer-ID",
            "fk_target": _fkTarget("poweron_app", "UserInDB", "username"),
        },
    )
    featureInstanceId: str = Field(
        default="",
        description="Feature instance scope",
        json_schema_extra={
            "label": "Feature-Instanz-ID",
            "fk_target": _fkTarget("poweron_app", "FeatureInstance", "label"),
        },
    )
    mandateId: str = Field(
        default="",
        description="Mandate scope",
        json_schema_extra={
            "label": "Mandanten-ID",
            "fk_target": _fkTarget("poweron_app", "Mandate", "label"),
        },
    )

    # --- File description -----------------------------------------------
    fileName: str = Field(
        description="Original file name",
        json_schema_extra={"label": "Dateiname"},
    )
    mimeType: str = Field(
        description="MIME type of the file",
        json_schema_extra={"label": "MIME-Typ"},
    )
    containerPath: Optional[str] = Field(
        default=None,
        description="Path within a container (e.g. 'archive.zip/folder/report.pdf')",
        json_schema_extra={"label": "Container-Pfad"},
    )

    # --- Extraction result ----------------------------------------------
    totalObjects: int = Field(
        default=0,
        description="Total number of content objects extracted",
        json_schema_extra={"label": "Anzahl Objekte"},
    )
    totalSize: int = Field(
        default=0,
        description="Total size of all content objects in bytes",
        json_schema_extra={"label": "Gesamtgroesse"},
    )
    structure: Dict[str, Any] = Field(
        default_factory=dict,
        description="Structural overview (pages, sections, hierarchy)",
        json_schema_extra={"label": "Struktur"},
    )
    objectSummary: List[Dict[str, Any]] = Field(
        default_factory=list,
        description="Compact summary per content object",
        json_schema_extra={"label": "Objekt-Zusammenfassung"},
    )
    extractedAt: float = Field(
        default_factory=getUtcTimestamp,
        description="Extraction timestamp",
        json_schema_extra={
            "label": "Extrahiert am",
            "frontend_type": "timestamp",
        },
    )

    # --- Processing state, visibility and origin ------------------------
    status: str = Field(
        default="pending",
        description="Processing status: pending, extracted, embedding, indexed, failed",
        json_schema_extra={"label": "Status"},
    )
    scope: str = Field(
        default="personal",
        description="Data visibility scope: personal, featureInstance, mandate, global",
        json_schema_extra={"label": "Sichtbarkeit"},
    )
    sourceKind: str = Field(
        default="file",
        description="Origin of the indexed content: file, sharepoint_item, outlook_message, outlook_attachment, ...",
        json_schema_extra={"label": "Quellenart"},
    )
    connectionId: Optional[str] = Field(
        default=None,
        description="UserConnection ID if this index entry originates from an external connector",
        json_schema_extra={"label": "Connection-ID"},
    )

    # --- Neutralization -------------------------------------------------
    neutralizationStatus: Optional[str] = Field(
        default=None,
        description="Neutralization status: completed, failed, skipped, None = not required",
        json_schema_extra={"label": "Neutralisierungsstatus"},
    )
    isNeutralized: bool = Field(
        default=False,
        description="True if content was neutralized before indexing",
        json_schema_extra={"label": "Neutralisiert"},
    )


@i18nModel("Inhalts-Chunk")
class ContentChunk(PowerOnModel):
    """Persisted content chunk with embedding vector."""

    # --- Identity and references ----------------------------------------
    id: str = Field(
        default_factory=_newId,
        description="Primary key",
        json_schema_extra={"label": "ID"},
    )
    contentObjectId: str = Field(
        description="Reference to the content object within FileContentIndex",
        json_schema_extra={"label": "Inhaltsobjekt-ID"},
    )
    fileId: str = Field(
        description="FK to the source file",
        json_schema_extra={
            "label": "Datei-ID",
            "fk_target": _fkTarget("poweron_management", "FileItem", "fileName"),
        },
    )
    userId: str = Field(
        description="Owner user ID",
        json_schema_extra={
            "label": "Benutzer-ID",
            "fk_target": _fkTarget("poweron_app", "UserInDB", "username"),
        },
    )
    featureInstanceId: str = Field(
        default="",
        description="Feature instance scope",
        json_schema_extra={
            "label": "Feature-Instanz-ID",
            "fk_target": _fkTarget("poweron_app", "FeatureInstance", "label"),
        },
    )

    # --- Payload --------------------------------------------------------
    contentType: str = Field(
        description="Content type: text, image, videostream, audiostream, other",
        json_schema_extra={"label": "Inhaltstyp"},
    )
    data: str = Field(
        description="Content data (text, base64, URL)",
        json_schema_extra={"label": "Daten"},
    )
    contextRef: Dict[str, Any] = Field(
        default_factory=dict,
        description="Context reference (page, position, label)",
        json_schema_extra={"label": "Kontext-Referenz"},
    )
    summary: Optional[str] = Field(
        default=None,
        description="AI-generated summary (on demand)",
        json_schema_extra={"label": "Zusammenfassung"},
    )
    chunkMetadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional metadata",
        json_schema_extra={"label": "Metadaten"},
    )

    # --- Vector ---------------------------------------------------------
    embedding: Optional[List[float]] = Field(
        default=None,
        description="pgvector embedding (NOT NULL for text chunks)",
        json_schema_extra={"label": "Embedding", "db_type": _VECTOR_DB_TYPE},
    )


@i18nModel("Runden-Speicher")
class RoundMemory(PowerOnModel):
    """Persistent memory per agent round."""

    # --- Identity and position within the workflow ----------------------
    id: str = Field(
        default_factory=_newId,
        description="Primary key",
        json_schema_extra={"label": "ID"},
    )
    workflowId: str = Field(
        description="FK to the workflow",
        json_schema_extra={"label": "Workflow-ID"},
    )
    roundNumber: int = Field(
        default=0,
        description="Agent round that produced this memory",
        json_schema_extra={"label": "Rundennummer"},
    )

    # --- Memory entry ---------------------------------------------------
    memoryType: str = Field(
        description="Category: file_ref, tool_result, decision, data_source_ref",
        json_schema_extra={"label": "Speichertyp"},
    )
    key: str = Field(
        description="Dedup key, e.g. 'readFile:' or 'plan'",
        json_schema_extra={"label": "Schluessel"},
    )
    summary: str = Field(
        default="",
        description="Compact summary (max ~2000 chars)",
        json_schema_extra={"label": "Zusammenfassung"},
    )
    fullData: Optional[str] = Field(
        default=None,
        description="Full tool output when small enough (max ~8000 chars)",
        json_schema_extra={"label": "Volldaten"},
    )
    fileIds: List[str] = Field(
        default_factory=list,
        description="Referenced file IDs",
        json_schema_extra={"label": "Datei-IDs"},
    )

    # --- Vector ---------------------------------------------------------
    embedding: Optional[List[float]] = Field(
        default=None,
        description="Embedding of summary for semantic retrieval",
        json_schema_extra={"label": "Embedding", "db_type": _VECTOR_DB_TYPE},
    )


@i18nModel("Workflow-Speicher")
class WorkflowMemory(PowerOnModel):
    """Workflow-specific key-value cache for entities and facts."""

    # --- Identity and scoping -------------------------------------------
    id: str = Field(
        default_factory=_newId,
        description="Primary key",
        json_schema_extra={"label": "ID"},
    )
    workflowId: str = Field(
        description="FK to the workflow",
        json_schema_extra={
            "label": "Workflow-ID",
            "fk_target": _fkTarget("poweron_chat", "ChatWorkflow", "name"),
        },
    )
    userId: str = Field(
        description="Owner user ID",
        json_schema_extra={
            "label": "Benutzer-ID",
            "fk_target": _fkTarget("poweron_app", "UserInDB", "username"),
        },
    )
    featureInstanceId: str = Field(
        default="",
        description="Feature instance scope",
        json_schema_extra={
            "label": "Feature-Instanz-ID",
            "fk_target": _fkTarget("poweron_app", "FeatureInstance", "label"),
        },
    )

    # --- Key-value entry ------------------------------------------------
    key: str = Field(
        description="Key identifier (e.g. 'entity:companyName')",
        json_schema_extra={"label": "Schluessel"},
    )
    value: str = Field(
        description="Extracted value",
        json_schema_extra={"label": "Wert"},
    )
    source: str = Field(
        default="extraction",
        description="Origin: extraction, tool, conversation, summary",
        json_schema_extra={"label": "Quelle"},
    )

    # --- Vector ---------------------------------------------------------
    embedding: Optional[List[float]] = Field(
        default=None,
        description="Optional embedding for semantic lookup",
        json_schema_extra={"label": "Embedding", "db_type": _VECTOR_DB_TYPE},
    )