gateway/modules/datamodels/datamodelKnowledge.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Knowledge Store data models: FileContentIndex, ContentChunk, RoundMemory, WorkflowMemory.

These models support the 3-tier RAG architecture:
- Shared Layer: mandateId-scoped, isShared=True
- Instance Layer: userId + featureInstanceId-scoped
- Workflow Layer: workflowId-scoped (WorkflowMemory)

Vector fields use json_schema_extra={"db_type": "vector(1536)"} for pgvector.
"""

from typing import Dict, Any, List, Optional
from pydantic import BaseModel, Field
from modules.datamodels.datamodelBase import PowerOnModel
from modules.shared.attributeUtils import registerModelLabels
from modules.shared.timeUtils import getUtcTimestamp
import uuid


class FileContentIndex(PowerOnModel):
    """Index record describing the content objects extracted from one file.

    The index is purely structural and is built without any AI involvement.
    It belongs to the Instance Layer and can additionally be exposed in the
    Shared Layer by setting ``isShared``.
    """

    # --- Identity, ownership and layer scoping ---
    id: str = Field(
        description="Primary key (typically = fileId)",
        default_factory=lambda: str(uuid.uuid4()),
    )
    userId: str = Field(description="Owner user ID")
    featureInstanceId: str = Field(
        description="Feature instance scope",
        default="",
    )
    mandateId: str = Field(
        description="Mandate scope",
        default="",
    )
    isShared: bool = Field(
        description="Visible in Shared Layer for all mandate users",
        default=False,
    )

    # --- File metadata ---
    fileName: str = Field(description="Original file name")
    mimeType: str = Field(description="MIME type of the file")
    containerPath: Optional[str] = Field(
        description="Path within a container (e.g. 'archive.zip/folder/report.pdf')",
        default=None,
    )

    # --- Extraction results ---
    totalObjects: int = Field(
        description="Total number of content objects extracted",
        default=0,
    )
    totalSize: int = Field(
        description="Total size of all content objects in bytes",
        default=0,
    )
    structure: Dict[str, Any] = Field(
        description="Structural overview (pages, sections, hierarchy)",
        default_factory=dict,
    )
    objectSummary: List[Dict[str, Any]] = Field(
        description="Compact summary per content object",
        default_factory=list,
    )
    extractedAt: float = Field(
        description="Extraction timestamp",
        default_factory=getUtcTimestamp,
    )

    # --- Processing, visibility and neutralization state ---
    status: str = Field(
        description="Processing status: pending, extracted, embedding, indexed, failed",
        default="pending",
    )
    scope: str = Field(
        description="Data visibility scope: personal, featureInstance, mandate, global",
        default="personal",
    )
    neutralizationStatus: Optional[str] = Field(
        description="Neutralization status: completed, failed, skipped, None = not required",
        default=None,
    )
    isNeutralized: bool = Field(
        description="True if content was neutralized before indexing",
        default=False,
    )


# Display labels for FileContentIndex and its fields.
# Every field carries an English and a French label, matching the other models
# in this module. The last three fields previously had only "en"/"de" entries,
# leaving them without a French label; "fr" is added and the existing "de"
# entries are kept so any German consumer keeps working.
registerModelLabels(
    "FileContentIndex",
    {"en": "File Content Index", "fr": "Index du contenu de fichier"},
    {
        "id": {"en": "ID", "fr": "ID"},
        "userId": {"en": "User ID", "fr": "ID utilisateur"},
        "featureInstanceId": {"en": "Feature Instance ID", "fr": "ID de l'instance"},
        "mandateId": {"en": "Mandate ID", "fr": "ID du mandat"},
        "isShared": {"en": "Shared", "fr": "Partagé"},
        "fileName": {"en": "File Name", "fr": "Nom de fichier"},
        "mimeType": {"en": "MIME Type", "fr": "Type MIME"},
        "containerPath": {"en": "Container Path", "fr": "Chemin du conteneur"},
        "totalObjects": {"en": "Total Objects", "fr": "Nombre total d'objets"},
        "totalSize": {"en": "Total Size", "fr": "Taille totale"},
        "structure": {"en": "Structure", "fr": "Structure"},
        "objectSummary": {"en": "Object Summary", "fr": "Résumé des objets"},
        "extractedAt": {"en": "Extracted At", "fr": "Extrait le"},
        "status": {"en": "Status", "fr": "Statut"},
        "scope": {"en": "Scope", "de": "Sichtbarkeit", "fr": "Portée"},
        "neutralizationStatus": {
            "en": "Neutralization Status",
            "de": "Neutralisierungsstatus",
            "fr": "Statut de neutralisation",
        },
        "isNeutralized": {
            "en": "Is Neutralized",
            "de": "Neutralisiert",
            "fr": "Neutralisé",
        },
    },
)


class ContentChunk(PowerOnModel):
    """Stored piece of file content together with its embedding vector.

    A chunk is a scalar content object, or a slice of one, and is persisted
    so that it can be reused across workflows. The ``embedding`` field
    declares a pgvector ``vector(1536)`` column type via ``json_schema_extra``.
    """

    # --- Identity and references ---
    id: str = Field(
        description="Primary key",
        default_factory=lambda: str(uuid.uuid4()),
    )
    contentObjectId: str = Field(
        description="Reference to the content object within FileContentIndex",
    )
    fileId: str = Field(description="FK to the source file")
    userId: str = Field(description="Owner user ID")
    featureInstanceId: str = Field(
        description="Feature instance scope",
        default="",
    )

    # --- Payload ---
    contentType: str = Field(
        description="Content type: text, image, videostream, audiostream, other",
    )
    data: str = Field(description="Content data (text, base64, URL)")
    contextRef: Dict[str, Any] = Field(
        description="Context reference (page, position, label)",
        default_factory=dict,
    )
    summary: Optional[str] = Field(
        description="AI-generated summary (on demand)",
        default=None,
    )
    chunkMetadata: Dict[str, Any] = Field(
        description="Additional metadata",
        default_factory=dict,
    )

    # --- Vector used for semantic search ---
    embedding: Optional[List[float]] = Field(
        description="pgvector embedding (NOT NULL for text chunks)",
        default=None,
        json_schema_extra={"db_type": "vector(1536)"},
    )


# Display labels (English / French) for ContentChunk and its fields.
# Each row is (field name, English label, French label); row order is the
# order in which the field labels are registered.
registerModelLabels(
    "ContentChunk",
    {"en": "Content Chunk", "fr": "Fragment de contenu"},
    {
        fieldName: {"en": english, "fr": french}
        for fieldName, english, french in (
            ("id", "ID", "ID"),
            ("contentObjectId", "Content Object ID", "ID de l'objet de contenu"),
            ("fileId", "File ID", "ID du fichier"),
            ("userId", "User ID", "ID utilisateur"),
            ("featureInstanceId", "Feature Instance ID", "ID de l'instance"),
            ("contentType", "Content Type", "Type de contenu"),
            ("data", "Data", "Données"),
            ("contextRef", "Context Reference", "Référence contextuelle"),
            ("summary", "Summary", "Résumé"),
            ("chunkMetadata", "Metadata", "Métadonnées"),
            ("embedding", "Embedding", "Vecteur d'embedding"),
        )
    },
)


class RoundMemory(PowerOnModel):
    """Durable memory entry written once per agent round.

    Captures tool results, file references and decisions so that RAG can
    still retrieve the relevant context after the ConversationManager has
    summarised older messages away.
    """

    # --- Identity and origin ---
    id: str = Field(
        description="Primary key",
        default_factory=lambda: str(uuid.uuid4()),
    )
    workflowId: str = Field(description="FK to the workflow")
    roundNumber: int = Field(
        description="Agent round that produced this memory",
        default=0,
    )

    # --- Classification and dedup ---
    memoryType: str = Field(
        description="Category: file_ref, tool_result, decision, data_source_ref",
    )
    key: str = Field(description="Dedup key, e.g. 'readFile:<fileId>' or 'plan'")

    # --- Content ---
    summary: str = Field(
        description="Compact summary (max ~2000 chars)",
        default="",
    )
    fullData: Optional[str] = Field(
        description="Full tool output when small enough (max ~8000 chars)",
        default=None,
    )
    fileIds: List[str] = Field(
        description="Referenced file IDs",
        default_factory=list,
    )

    # --- Vector used for semantic retrieval ---
    embedding: Optional[List[float]] = Field(
        description="Embedding of summary for semantic retrieval",
        default=None,
        json_schema_extra={"db_type": "vector(1536)"},
    )


# Display labels (English / French) for RoundMemory and its fields.
# Each row is (field name, English label, French label); row order is the
# order in which the field labels are registered.
registerModelLabels(
    "RoundMemory",
    {"en": "Round Memory", "fr": "Mémoire de tour"},
    {
        fieldName: {"en": english, "fr": french}
        for fieldName, english, french in (
            ("id", "ID", "ID"),
            ("workflowId", "Workflow ID", "ID du workflow"),
            ("roundNumber", "Round Number", "Numéro de tour"),
            ("memoryType", "Memory Type", "Type de mémoire"),
            ("key", "Key", "Clé"),
            ("summary", "Summary", "Résumé"),
            ("fullData", "Full Data", "Données complètes"),
            ("fileIds", "File IDs", "IDs de fichier"),
            ("embedding", "Embedding", "Vecteur d'embedding"),
        )
    },
)


class WorkflowMemory(PowerOnModel):
    """Key-value cache of entities and facts, scoped to a workflow.

    Entries are extracted during agent rounds and persisted so they can be
    reused in later rounds and by other workflows.
    """

    # --- Identity and scoping ---
    id: str = Field(
        description="Primary key",
        default_factory=lambda: str(uuid.uuid4()),
    )
    workflowId: str = Field(description="FK to the workflow")
    userId: str = Field(description="Owner user ID")
    featureInstanceId: str = Field(
        description="Feature instance scope",
        default="",
    )

    # --- Cached fact ---
    key: str = Field(description="Key identifier (e.g. 'entity:companyName')")
    value: str = Field(description="Extracted value")
    source: str = Field(
        description="Origin: extraction, tool, conversation, summary",
        default="extraction",
    )

    # --- Optional vector for semantic lookup ---
    embedding: Optional[List[float]] = Field(
        description="Optional embedding for semantic lookup",
        default=None,
        json_schema_extra={"db_type": "vector(1536)"},
    )


# Display labels (English / French) for WorkflowMemory and its fields.
# Each row is (field name, English label, French label); row order is the
# order in which the field labels are registered.
registerModelLabels(
    "WorkflowMemory",
    {"en": "Workflow Memory", "fr": "Mémoire de workflow"},
    {
        fieldName: {"en": english, "fr": french}
        for fieldName, english, french in (
            ("id", "ID", "ID"),
            ("workflowId", "Workflow ID", "ID du workflow"),
            ("userId", "User ID", "ID utilisateur"),
            ("featureInstanceId", "Feature Instance ID", "ID de l'instance"),
            ("key", "Key", "Clé"),
            ("value", "Value", "Valeur"),
            ("source", "Source", "Source"),
            ("embedding", "Embedding", "Vecteur d'embedding"),
        )
    },
)