# gateway/modules/datamodels/datamodelKnowledge.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Knowledge Store data models: FileContentIndex, ContentChunk, RoundMemory, WorkflowMemory.

These models support the layered RAG architecture:
- Personal Layer: scope=personal, userId-scoped
- Instance Layer: scope=featureInstance, featureInstanceId-scoped
- Mandate Layer: scope=mandate, mandateId-scoped (visible to all mandate users)
- Global Layer: scope=global (sysAdmin only)
- Workflow Layer: workflowId-scoped (WorkflowMemory)

Vector fields use json_schema_extra={"db_type": "vector(1536)"} for pgvector.
"""

from typing import Dict, Any, List, Optional
from pydantic import BaseModel, Field
from modules.datamodels.datamodelBase import PowerOnModel
from modules.shared.i18nRegistry import i18nModel
from modules.shared.timeUtils import getUtcTimestamp
import uuid


# Structural index of the content objects of a file (English rendering of the
# German class docstring below).
# NOTE(review): the docstring is kept verbatim on purpose. If PowerOnModel is a
# pydantic model, the class docstring is emitted as the JSON-schema
# "description", i.e. it is runtime text rather than a plain comment. Confirm
# before translating it.
@i18nModel("Datei-Inhaltsindex")
class FileContentIndex(PowerOnModel):
    """Struktureller Index der Inhaltsobjekte einer Datei."""
    # Required fields (declared without a default): userId, fileName, mimeType.
    # Every field carries a "label" in json_schema_extra; some add "fk_target"
    # (db / table / labelField) or "frontend_type". How these keys are consumed
    # is not visible in this file - presumably by the i18n/UI and DB layers.

    # Falls back to a random UUID4 string when no id is passed. Per the
    # description the id is typically the file's own ID.
    id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Primary key (typically = fileId)",
        json_schema_extra={"label": "ID"},
    )
    userId: str = Field(
        description="Owner user ID",
        json_schema_extra={"label": "Benutzer-ID", "fk_target": {"db": "poweron_app", "table": "UserInDB", "labelField": "username"}},
    )
    # Both scope IDs default to the empty string (not None) when unset.
    featureInstanceId: str = Field(
        default="",
        description="Feature instance scope",
        json_schema_extra={"label": "Feature-Instanz-ID", "fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"}},
    )
    mandateId: str = Field(
        default="",
        description="Mandate scope",
        json_schema_extra={"label": "Mandanten-ID", "fk_target": {"db": "poweron_app", "table": "Mandate", "labelField": "label"}},
    )
    fileName: str = Field(
        description="Original file name",
        json_schema_extra={"label": "Dateiname"},
    )
    mimeType: str = Field(
        description="MIME type of the file",
        json_schema_extra={"label": "MIME-Typ"},
    )
    # None when the file is not nested inside a container such as an archive.
    containerPath: Optional[str] = Field(
        default=None,
        description="Path within a container (e.g. 'archive.zip/folder/report.pdf')",
        json_schema_extra={"label": "Container-Pfad"},
    )
    totalObjects: int = Field(
        default=0,
        description="Total number of content objects extracted",
        json_schema_extra={"label": "Anzahl Objekte"},
    )
    totalSize: int = Field(
        default=0,
        description="Total size of all content objects in bytes",
        json_schema_extra={"label": "Gesamtgroesse"},
    )
    # Free-form dict / list of dicts; their inner schema is not defined here.
    structure: Dict[str, Any] = Field(
        default_factory=dict,
        description="Structural overview (pages, sections, hierarchy)",
        json_schema_extra={"label": "Struktur"},
    )
    objectSummary: List[Dict[str, Any]] = Field(
        default_factory=list,
        description="Compact summary per content object",
        json_schema_extra={"label": "Objekt-Zusammenfassung"},
    )
    # Filled by getUtcTimestamp() when no value is supplied. Annotated as float;
    # the unit is not visible here - presumably epoch seconds, TODO confirm.
    extractedAt: float = Field(
        default_factory=getUtcTimestamp,
        description="Extraction timestamp",
        json_schema_extra={"label": "Extrahiert am", "frontend_type": "timestamp"},
    )
    # status and scope are plain str: the values listed in their descriptions
    # are documentation only and are not enforced by this model.
    status: str = Field(
        default="pending",
        description="Processing status: pending, extracted, embedding, indexed, failed",
        json_schema_extra={"label": "Status"},
    )
    scope: str = Field(
        default="personal",
        description="Data visibility scope: personal, featureInstance, mandate, global",
        json_schema_extra={"label": "Sichtbarkeit"},
    )
    # None means neutralization was not required (per the description).
    neutralizationStatus: Optional[str] = Field(
        default=None,
        description="Neutralization status: completed, failed, skipped, None = not required",
        json_schema_extra={"label": "Neutralisierungsstatus"},
    )
    isNeutralized: bool = Field(
        default=False,
        description="True if content was neutralized before indexing",
        json_schema_extra={"label": "Neutralisiert"},
    )


# Persisted content chunk with an embedding vector (English rendering of the
# German class docstring below).
# NOTE(review): the docstring is kept verbatim on purpose. If PowerOnModel is a
# pydantic model, the class docstring is emitted as the JSON-schema
# "description", i.e. it is runtime text rather than a plain comment. Confirm
# before translating it.
@i18nModel("Inhalts-Chunk")
class ContentChunk(PowerOnModel):
    """Persistierter Inhalts-Chunk mit Embedding-Vektor."""
    # Required fields (declared without a default): contentObjectId, fileId,
    # userId, contentType, data.
    # NOTE(review): unlike FileContentIndex there is no mandateId field here;
    # confirm that mandate scoping is resolved via the owning file.

    # Falls back to a random UUID4 string when no id is passed.
    id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Primary key",
        json_schema_extra={"label": "ID"},
    )
    contentObjectId: str = Field(
        description="Reference to the content object within FileContentIndex",
        json_schema_extra={"label": "Inhaltsobjekt-ID"},
    )
    # The fk_target points at a different database (poweron_management) than
    # the user / feature-instance references (poweron_app).
    fileId: str = Field(
        description="FK to the source file",
        json_schema_extra={"label": "Datei-ID", "fk_target": {"db": "poweron_management", "table": "FileItem", "labelField": "fileName"}},
    )
    userId: str = Field(
        description="Owner user ID",
        json_schema_extra={"label": "Benutzer-ID", "fk_target": {"db": "poweron_app", "table": "UserInDB", "labelField": "username"}},
    )
    # Defaults to the empty string (not None) when unset.
    featureInstanceId: str = Field(
        default="",
        description="Feature instance scope",
        json_schema_extra={"label": "Feature-Instanz-ID", "fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"}},
    )
    # Plain str: the content types listed in the description are not enforced
    # by this model.
    contentType: str = Field(
        description="Content type: text, image, videostream, audiostream, other",
        json_schema_extra={"label": "Inhaltstyp"},
    )
    # Always a string; per the description it may hold text, base64 or a URL.
    data: str = Field(
        description="Content data (text, base64, URL)",
        json_schema_extra={"label": "Daten"},
    )
    contextRef: Dict[str, Any] = Field(
        default_factory=dict,
        description="Context reference (page, position, label)",
        json_schema_extra={"label": "Kontext-Referenz"},
    )
    summary: Optional[str] = Field(
        default=None,
        description="AI-generated summary (on demand)",
        json_schema_extra={"label": "Zusammenfassung"},
    )
    chunkMetadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional metadata",
        json_schema_extra={"label": "Metadaten"},
    )
    # Optional with default None at model level, although the description says
    # it is NOT NULL for text chunks - that rule is not enforced here.
    # "db_type" marks the column as vector(1536) for pgvector (see the module
    # docstring); the list length is not validated by this model.
    embedding: Optional[List[float]] = Field(
        default=None,
        description="pgvector embedding (NOT NULL for text chunks)",
        json_schema_extra={"label": "Embedding", "db_type": "vector(1536)"},
    )


# Persistent memory per agent round (English rendering of the German class
# docstring below).
# NOTE(review): the docstring is kept verbatim on purpose. If PowerOnModel is a
# pydantic model, the class docstring is emitted as the JSON-schema
# "description", i.e. it is runtime text rather than a plain comment. Confirm
# before translating it.
@i18nModel("Runden-Speicher")
class RoundMemory(PowerOnModel):
    """Persistenter Speicher pro Agenten-Runde."""
    # Required fields (declared without a default): workflowId, memoryType, key.

    # Falls back to a random UUID4 string when no id is passed.
    id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Primary key",
        json_schema_extra={"label": "ID"},
    )
    # NOTE(review): no "fk_target" is declared here, whereas
    # WorkflowMemory.workflowId in this file points at poweron_chat/ChatWorkflow.
    # Confirm whether the omission is intentional.
    workflowId: str = Field(
        description="FK to the workflow",
        json_schema_extra={"label": "Workflow-ID"},
    )
    roundNumber: int = Field(
        default=0,
        description="Agent round that produced this memory",
        json_schema_extra={"label": "Rundennummer"},
    )
    # Plain str: the categories listed in the description are not enforced by
    # this model.
    memoryType: str = Field(
        description="Category: file_ref, tool_result, decision, data_source_ref",
        json_schema_extra={"label": "Speichertyp"},
    )
    # Described as a dedup key; uniqueness is not enforced by this model.
    key: str = Field(
        description="Dedup key, e.g. 'readFile:<fileId>' or 'plan'",
        json_schema_extra={"label": "Schluessel"},
    )
    # The size limits mentioned in the summary / fullData descriptions are
    # documentation only; no length constraint is declared on either field.
    summary: str = Field(
        default="",
        description="Compact summary (max ~2000 chars)",
        json_schema_extra={"label": "Zusammenfassung"},
    )
    fullData: Optional[str] = Field(
        default=None,
        description="Full tool output when small enough (max ~8000 chars)",
        json_schema_extra={"label": "Volldaten"},
    )
    fileIds: List[str] = Field(
        default_factory=list,
        description="Referenced file IDs",
        json_schema_extra={"label": "Datei-IDs"},
    )
    # "db_type" marks the column as vector(1536) for pgvector (see the module
    # docstring); the list length is not validated by this model.
    embedding: Optional[List[float]] = Field(
        default=None,
        description="Embedding of summary for semantic retrieval",
        json_schema_extra={"label": "Embedding", "db_type": "vector(1536)"},
    )


# Workflow-specific key-value cache for entities and facts (English rendering
# of the German class docstring below).
# NOTE(review): the docstring is kept verbatim on purpose. If PowerOnModel is a
# pydantic model, the class docstring is emitted as the JSON-schema
# "description", i.e. it is runtime text rather than a plain comment. Confirm
# before translating it.
@i18nModel("Workflow-Speicher")
class WorkflowMemory(PowerOnModel):
    """Workflow-spezifischer Key-Value-Cache fuer Entitaeten und Fakten."""
    # Required fields (declared without a default): workflowId, userId, key,
    # value.

    # Falls back to a random UUID4 string when no id is passed.
    id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Primary key",
        json_schema_extra={"label": "ID"},
    )
    # The fk_target points at the poweron_chat database, not poweron_app.
    workflowId: str = Field(
        description="FK to the workflow",
        json_schema_extra={"label": "Workflow-ID", "fk_target": {"db": "poweron_chat", "table": "ChatWorkflow", "labelField": "name"}},
    )
    userId: str = Field(
        description="Owner user ID",
        json_schema_extra={"label": "Benutzer-ID", "fk_target": {"db": "poweron_app", "table": "UserInDB", "labelField": "username"}},
    )
    # Defaults to the empty string (not None) when unset.
    featureInstanceId: str = Field(
        default="",
        description="Feature instance scope",
        json_schema_extra={"label": "Feature-Instanz-ID", "fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"}},
    )
    # key / value are both plain strings; the "entity:<name>" key format shown
    # in the description is an example and is not validated here.
    key: str = Field(
        description="Key identifier (e.g. 'entity:companyName')",
        json_schema_extra={"label": "Schluessel"},
    )
    value: str = Field(
        description="Extracted value",
        json_schema_extra={"label": "Wert"},
    )
    # Plain str: the origins listed in the description are not enforced by this
    # model.
    source: str = Field(
        default="extraction",
        description="Origin: extraction, tool, conversation, summary",
        json_schema_extra={"label": "Quelle"},
    )
    # "db_type" marks the column as vector(1536) for pgvector (see the module
    # docstring); the list length is not validated by this model.
    embedding: Optional[List[float]] = Field(
        default=None,
        description="Optional embedding for semantic lookup",
        json_schema_extra={"label": "Embedding", "db_type": "vector(1536)"},
    )