# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Knowledge Store data models: FileContentIndex, ContentChunk, WorkflowMemory.

These models support the 3-tier RAG architecture:
- Shared Layer: mandateId-scoped, isShared=True
- Instance Layer: userId + featureInstanceId-scoped
- Workflow Layer: workflowId-scoped (WorkflowMemory)

Vector fields use json_schema_extra={"db_type": "vector(1536)"} for pgvector.
"""

from typing import Dict, Any, List, Optional
from pydantic import BaseModel, Field
from modules.shared.attributeUtils import registerModelLabels
from modules.shared.timeUtils import getUtcTimestamp
import uuid


def _newId() -> str:
    """Return a freshly generated UUID4 rendered as a string."""
    return str(uuid.uuid4())


def _label(en: str, fr: str) -> Dict[str, str]:
    """Build one bilingual label entry with the keys ``en`` and ``fr``."""
    return {"en": en, "fr": fr}


class FileContentIndex(BaseModel):
    """Structural index of a file's content objects. Created without AI.
    Lives in the Instance Layer; optionally promoted to Shared Layer via isShared."""

    # --- identity and scoping ---
    id: str = Field(
        default_factory=_newId,
        description="Primary key (typically = fileId)",
    )
    userId: str = Field(description="Owner user ID")
    featureInstanceId: str = Field(
        default="",
        description="Feature instance scope",
    )
    mandateId: str = Field(default="", description="Mandate scope")
    isShared: bool = Field(
        default=False,
        description="Visible in Shared Layer for all mandate users",
    )

    # --- file description ---
    fileName: str = Field(description="Original file name")
    mimeType: str = Field(description="MIME type of the file")
    containerPath: Optional[str] = Field(
        default=None,
        description="Path within a container (e.g. 'archive.zip/folder/report.pdf')",
    )

    # --- extraction results ---
    totalObjects: int = Field(
        default=0,
        description="Total number of content objects extracted",
    )
    totalSize: int = Field(
        default=0,
        description="Total size of all content objects in bytes",
    )
    structure: Dict[str, Any] = Field(
        default_factory=dict,
        description="Structural overview (pages, sections, hierarchy)",
    )
    objectSummary: List[Dict[str, Any]] = Field(
        default_factory=list,
        description="Compact summary per content object",
    )
    extractedAt: float = Field(
        default_factory=getUtcTimestamp,
        description="Extraction timestamp",
    )
    status: str = Field(
        default="pending",
        description="Processing status: pending, extracted, embedding, indexed, failed",
    )


# Register the English/French display labels for the model and each field.
registerModelLabels(
    "FileContentIndex",
    _label("File Content Index", "Index du contenu de fichier"),
    {
        "id": _label("ID", "ID"),
        "userId": _label("User ID", "ID utilisateur"),
        "featureInstanceId": _label("Feature Instance ID", "ID de l'instance"),
        "mandateId": _label("Mandate ID", "ID du mandat"),
        "isShared": _label("Shared", "Partagé"),
        "fileName": _label("File Name", "Nom de fichier"),
        "mimeType": _label("MIME Type", "Type MIME"),
        "containerPath": _label("Container Path", "Chemin du conteneur"),
        "totalObjects": _label("Total Objects", "Nombre total d'objets"),
        "totalSize": _label("Total Size", "Taille totale"),
        "structure": _label("Structure", "Structure"),
        "objectSummary": _label("Object Summary", "Résumé des objets"),
        "extractedAt": _label("Extracted At", "Extrait le"),
        "status": _label("Status", "Statut"),
    },
)


class ContentChunk(BaseModel):
    """Persisted content chunk with embedding vector. Reusable across workflows.
    Scalar content object (or chunk thereof) with pgvector embedding."""

    # --- identity, references and scoping ---
    id: str = Field(default_factory=_newId, description="Primary key")
    contentObjectId: str = Field(
        description="Reference to the content object within FileContentIndex",
    )
    fileId: str = Field(description="FK to the source file")
    userId: str = Field(description="Owner user ID")
    featureInstanceId: str = Field(
        default="",
        description="Feature instance scope",
    )

    # --- payload ---
    contentType: str = Field(
        description="Content type: text, image, videostream, audiostream, other",
    )
    data: str = Field(description="Content data (text, base64, URL)")
    contextRef: Dict[str, Any] = Field(
        default_factory=dict,
        description="Context reference (page, position, label)",
    )
    summary: Optional[str] = Field(
        default=None,
        description="AI-generated summary (on demand)",
    )
    chunkMetadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional metadata",
    )

    # --- vector column; db_type hint is carried via json_schema_extra ---
    embedding: Optional[List[float]] = Field(
        default=None,
        description="pgvector embedding (NOT NULL for text chunks)",
        json_schema_extra={"db_type": "vector(1536)"},
    )


# Register the English/French display labels for the model and each field.
registerModelLabels(
    "ContentChunk",
    _label("Content Chunk", "Fragment de contenu"),
    {
        "id": _label("ID", "ID"),
        "contentObjectId": _label("Content Object ID", "ID de l'objet de contenu"),
        "fileId": _label("File ID", "ID du fichier"),
        "userId": _label("User ID", "ID utilisateur"),
        "featureInstanceId": _label("Feature Instance ID", "ID de l'instance"),
        "contentType": _label("Content Type", "Type de contenu"),
        "data": _label("Data", "Données"),
        "contextRef": _label("Context Reference", "Référence contextuelle"),
        "summary": _label("Summary", "Résumé"),
        "chunkMetadata": _label("Metadata", "Métadonnées"),
        "embedding": _label("Embedding", "Vecteur d'embedding"),
    },
)


class WorkflowMemory(BaseModel):
    """Workflow-scoped key-value cache for entities and facts.
    Extracted during agent rounds, persisted for cross-round and cross-workflow reuse."""

    # --- identity and scoping ---
    id: str = Field(default_factory=_newId, description="Primary key")
    workflowId: str = Field(description="FK to the workflow")
    userId: str = Field(description="Owner user ID")
    featureInstanceId: str = Field(
        default="",
        description="Feature instance scope",
    )

    # --- key/value entry ---
    key: str = Field(description="Key identifier (e.g. 'entity:companyName')")
    value: str = Field(description="Extracted value")
    source: str = Field(
        default="extraction",
        description="Origin: extraction, tool, conversation, summary",
    )
    createdAt: float = Field(
        default_factory=getUtcTimestamp,
        description="Creation timestamp",
    )

    # --- vector column; db_type hint is carried via json_schema_extra ---
    embedding: Optional[List[float]] = Field(
        default=None,
        description="Optional embedding for semantic lookup",
        json_schema_extra={"db_type": "vector(1536)"},
    )


# Register the English/French display labels for the model and each field.
registerModelLabels(
    "WorkflowMemory",
    _label("Workflow Memory", "Mémoire de workflow"),
    {
        "id": _label("ID", "ID"),
        "workflowId": _label("Workflow ID", "ID du workflow"),
        "userId": _label("User ID", "ID utilisateur"),
        "featureInstanceId": _label("Feature Instance ID", "ID de l'instance"),
        "key": _label("Key", "Clé"),
        "value": _label("Value", "Valeur"),
        "source": _label("Source", "Source"),
        "createdAt": _label("Created At", "Créé le"),
        "embedding": _label("Embedding", "Vecteur d'embedding"),
    },
)