# gateway/modules/datamodels/datamodelKnowledge.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Knowledge Store data models: FileContentIndex, ContentChunk, WorkflowMemory.

These models support the 3-tier RAG architecture:
- Shared Layer: mandateId-scoped, isShared=True
- Instance Layer: userId + featureInstanceId-scoped
- Workflow Layer: workflowId-scoped (WorkflowMemory)

Vector fields use json_schema_extra={"db_type": "vector(1536)"} for pgvector.
"""

from typing import Dict, Any, List, Optional
from pydantic import BaseModel, Field
from modules.shared.attributeUtils import registerModelLabels
from modules.shared.timeUtils import getUtcTimestamp
import uuid


class FileContentIndex(BaseModel):
    """Structural index describing the content objects extracted from one file.

    The index is produced without AI. It belongs to the Instance Layer and can
    optionally be promoted to the Shared Layer through ``isShared``.
    """

    # --- Identity and scoping ---
    id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Primary key (typically = fileId)",
    )
    userId: str = Field(
        description="Owner user ID",
    )
    featureInstanceId: str = Field(
        default="",
        description="Feature instance scope",
    )
    mandateId: str = Field(
        default="",
        description="Mandate scope",
    )
    isShared: bool = Field(
        default=False,
        description="Visible in Shared Layer for all mandate users",
    )

    # --- File description ---
    fileName: str = Field(
        description="Original file name",
    )
    mimeType: str = Field(
        description="MIME type of the file",
    )
    containerPath: Optional[str] = Field(
        default=None,
        description="Path within a container (e.g. 'archive.zip/folder/report.pdf')",
    )

    # --- Extraction results ---
    totalObjects: int = Field(
        default=0,
        description="Total number of content objects extracted",
    )
    totalSize: int = Field(
        default=0,
        description="Total size of all content objects in bytes",
    )
    structure: Dict[str, Any] = Field(
        default_factory=dict,
        description="Structural overview (pages, sections, hierarchy)",
    )
    objectSummary: List[Dict[str, Any]] = Field(
        default_factory=list,
        description="Compact summary per content object",
    )

    # --- Processing state ---
    # Timestamp comes from the shared getUtcTimestamp helper at instantiation.
    extractedAt: float = Field(
        default_factory=getUtcTimestamp,
        description="Extraction timestamp",
    )
    # Free-form string; the documented values are listed in the description.
    status: str = Field(
        default="pending",
        description="Processing status: pending, extracted, embedding, indexed, failed",
    )


# Register the display labels (English / French) for FileContentIndex and
# each of its fields. Rows are (field name, English label, French label),
# listed in the same order as the fields are declared on the model.
registerModelLabels(
    "FileContentIndex",
    {"en": "File Content Index", "fr": "Index du contenu de fichier"},
    {
        fieldName: {"en": labelEn, "fr": labelFr}
        for fieldName, labelEn, labelFr in (
            ("id", "ID", "ID"),
            ("userId", "User ID", "ID utilisateur"),
            ("featureInstanceId", "Feature Instance ID", "ID de l'instance"),
            ("mandateId", "Mandate ID", "ID du mandat"),
            ("isShared", "Shared", "Partagé"),
            ("fileName", "File Name", "Nom de fichier"),
            ("mimeType", "MIME Type", "Type MIME"),
            ("containerPath", "Container Path", "Chemin du conteneur"),
            ("totalObjects", "Total Objects", "Nombre total d'objets"),
            ("totalSize", "Total Size", "Taille totale"),
            ("structure", "Structure", "Structure"),
            ("objectSummary", "Object Summary", "Résumé des objets"),
            ("extractedAt", "Extracted At", "Extrait le"),
            ("status", "Status", "Statut"),
        )
    },
)


class ContentChunk(BaseModel):
    """A persisted content object (or a chunk of one) with its embedding vector.

    Chunks are stored so they can be reused across workflows. The embedding
    is declared with a pgvector column type via ``json_schema_extra``.
    """

    # --- Identity and references ---
    id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Primary key",
    )
    contentObjectId: str = Field(
        description="Reference to the content object within FileContentIndex",
    )
    fileId: str = Field(
        description="FK to the source file",
    )

    # --- Scoping ---
    userId: str = Field(
        description="Owner user ID",
    )
    featureInstanceId: str = Field(
        default="",
        description="Feature instance scope",
    )

    # --- Payload ---
    contentType: str = Field(
        description="Content type: text, image, videostream, audiostream, other",
    )
    data: str = Field(
        description="Content data (text, base64, URL)",
    )
    contextRef: Dict[str, Any] = Field(
        default_factory=dict,
        description="Context reference (page, position, label)",
    )
    summary: Optional[str] = Field(
        default=None,
        description="AI-generated summary (on demand)",
    )
    chunkMetadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional metadata",
    )

    # --- Vector ---
    # NOTE(review): the description requires a value for text chunks, but this
    # model accepts None for every content type; presumably the requirement is
    # enforced by the writer or the database - confirm.
    embedding: Optional[List[float]] = Field(
        default=None,
        description="pgvector embedding (NOT NULL for text chunks)",
        json_schema_extra={"db_type": "vector(1536)"},
    )


# Register the display labels (English / French) for ContentChunk and each of
# its fields. Rows are (field name, English label, French label), listed in
# the same order as the fields are declared on the model.
registerModelLabels(
    "ContentChunk",
    {"en": "Content Chunk", "fr": "Fragment de contenu"},
    {
        fieldName: {"en": labelEn, "fr": labelFr}
        for fieldName, labelEn, labelFr in (
            ("id", "ID", "ID"),
            ("contentObjectId", "Content Object ID", "ID de l'objet de contenu"),
            ("fileId", "File ID", "ID du fichier"),
            ("userId", "User ID", "ID utilisateur"),
            ("featureInstanceId", "Feature Instance ID", "ID de l'instance"),
            ("contentType", "Content Type", "Type de contenu"),
            ("data", "Data", "Données"),
            ("contextRef", "Context Reference", "Référence contextuelle"),
            ("summary", "Summary", "Résumé"),
            ("chunkMetadata", "Metadata", "Métadonnées"),
            ("embedding", "Embedding", "Vecteur d'embedding"),
        )
    },
)


class WorkflowMemory(BaseModel):
    """Key-value memory entry (entity or fact) attached to a workflow.

    Entries are extracted during agent rounds and persisted so that later
    rounds, and other workflows, can reuse them.
    """

    # --- Identity and scoping ---
    id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Primary key",
    )
    workflowId: str = Field(
        description="FK to the workflow",
    )
    userId: str = Field(
        description="Owner user ID",
    )
    featureInstanceId: str = Field(
        default="",
        description="Feature instance scope",
    )

    # --- Stored fact ---
    key: str = Field(
        description="Key identifier (e.g. 'entity:companyName')",
    )
    value: str = Field(
        description="Extracted value",
    )
    source: str = Field(
        default="extraction",
        description="Origin: extraction, tool, conversation, summary",
    )
    # Timestamp comes from the shared getUtcTimestamp helper at instantiation.
    createdAt: float = Field(
        default_factory=getUtcTimestamp,
        description="Creation timestamp",
    )

    # --- Vector ---
    # Optional; declared with a pgvector column type via json_schema_extra.
    embedding: Optional[List[float]] = Field(
        default=None,
        description="Optional embedding for semantic lookup",
        json_schema_extra={"db_type": "vector(1536)"},
    )


# Register the display labels (English / French) for WorkflowMemory and each
# of its fields. Rows are (field name, English label, French label), listed
# in the same order as the fields are declared on the model.
registerModelLabels(
    "WorkflowMemory",
    {"en": "Workflow Memory", "fr": "Mémoire de workflow"},
    {
        fieldName: {"en": labelEn, "fr": labelFr}
        for fieldName, labelEn, labelFr in (
            ("id", "ID", "ID"),
            ("workflowId", "Workflow ID", "ID du workflow"),
            ("userId", "User ID", "ID utilisateur"),
            ("featureInstanceId", "Feature Instance ID", "ID de l'instance"),
            ("key", "Key", "Clé"),
            ("value", "Value", "Valeur"),
            ("source", "Source", "Source"),
            ("createdAt", "Created At", "Créé le"),
            ("embedding", "Embedding", "Vecteur d'embedding"),
        )
    },
)