gateway/modules/datamodels/datamodelKnowledge.py
2026-03-15 23:38:21 +01:00

130 lines
6.8 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Knowledge Store data models: FileContentIndex, ContentChunk, WorkflowMemory.
These models support the 3-tier RAG architecture:
- Shared Layer: mandateId-scoped, isShared=True
- Instance Layer: userId + featureInstanceId-scoped
- Workflow Layer: workflowId-scoped (WorkflowMemory)
Vector fields use json_schema_extra={"db_type": "vector(1536)"} for pgvector.
"""
from typing import Dict, Any, List, Optional
from pydantic import BaseModel, Field
from modules.shared.attributeUtils import registerModelLabels
from modules.shared.timeUtils import getUtcTimestamp
import uuid
class FileContentIndex(BaseModel):
"""Structural index of a file's content objects. Created without AI.
Lives in the Instance Layer; optionally promoted to Shared Layer via isShared."""
id: str = Field(default_factory=lambda: str(uuid.uuid4()), description="Primary key (typically = fileId)")
userId: str = Field(description="Owner user ID")
featureInstanceId: str = Field(default="", description="Feature instance scope")
mandateId: str = Field(default="", description="Mandate scope")
isShared: bool = Field(default=False, description="Visible in Shared Layer for all mandate users")
fileName: str = Field(description="Original file name")
mimeType: str = Field(description="MIME type of the file")
containerPath: Optional[str] = Field(default=None, description="Path within a container (e.g. 'archive.zip/folder/report.pdf')")
totalObjects: int = Field(default=0, description="Total number of content objects extracted")
totalSize: int = Field(default=0, description="Total size of all content objects in bytes")
structure: Dict[str, Any] = Field(default_factory=dict, description="Structural overview (pages, sections, hierarchy)")
objectSummary: List[Dict[str, Any]] = Field(default_factory=list, description="Compact summary per content object")
extractedAt: float = Field(default_factory=getUtcTimestamp, description="Extraction timestamp")
status: str = Field(default="pending", description="Processing status: pending, extracted, embedding, indexed, failed")
registerModelLabels(
"FileContentIndex",
{"en": "File Content Index", "fr": "Index du contenu de fichier"},
{
"id": {"en": "ID", "fr": "ID"},
"userId": {"en": "User ID", "fr": "ID utilisateur"},
"featureInstanceId": {"en": "Feature Instance ID", "fr": "ID de l'instance"},
"mandateId": {"en": "Mandate ID", "fr": "ID du mandat"},
"isShared": {"en": "Shared", "fr": "Partagé"},
"fileName": {"en": "File Name", "fr": "Nom de fichier"},
"mimeType": {"en": "MIME Type", "fr": "Type MIME"},
"containerPath": {"en": "Container Path", "fr": "Chemin du conteneur"},
"totalObjects": {"en": "Total Objects", "fr": "Nombre total d'objets"},
"totalSize": {"en": "Total Size", "fr": "Taille totale"},
"structure": {"en": "Structure", "fr": "Structure"},
"objectSummary": {"en": "Object Summary", "fr": "Résumé des objets"},
"extractedAt": {"en": "Extracted At", "fr": "Extrait le"},
"status": {"en": "Status", "fr": "Statut"},
},
)
class ContentChunk(BaseModel):
"""Persisted content chunk with embedding vector. Reusable across workflows.
Scalar content object (or chunk thereof) with pgvector embedding."""
id: str = Field(default_factory=lambda: str(uuid.uuid4()), description="Primary key")
contentObjectId: str = Field(description="Reference to the content object within FileContentIndex")
fileId: str = Field(description="FK to the source file")
userId: str = Field(description="Owner user ID")
featureInstanceId: str = Field(default="", description="Feature instance scope")
contentType: str = Field(description="Content type: text, image, videostream, audiostream, other")
data: str = Field(description="Content data (text, base64, URL)")
contextRef: Dict[str, Any] = Field(default_factory=dict, description="Context reference (page, position, label)")
summary: Optional[str] = Field(default=None, description="AI-generated summary (on demand)")
chunkMetadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata")
embedding: Optional[List[float]] = Field(
default=None, description="pgvector embedding (NOT NULL for text chunks)",
json_schema_extra={"db_type": "vector(1536)"}
)
registerModelLabels(
"ContentChunk",
{"en": "Content Chunk", "fr": "Fragment de contenu"},
{
"id": {"en": "ID", "fr": "ID"},
"contentObjectId": {"en": "Content Object ID", "fr": "ID de l'objet de contenu"},
"fileId": {"en": "File ID", "fr": "ID du fichier"},
"userId": {"en": "User ID", "fr": "ID utilisateur"},
"featureInstanceId": {"en": "Feature Instance ID", "fr": "ID de l'instance"},
"contentType": {"en": "Content Type", "fr": "Type de contenu"},
"data": {"en": "Data", "fr": "Données"},
"contextRef": {"en": "Context Reference", "fr": "Référence contextuelle"},
"summary": {"en": "Summary", "fr": "Résumé"},
"chunkMetadata": {"en": "Metadata", "fr": "Métadonnées"},
"embedding": {"en": "Embedding", "fr": "Vecteur d'embedding"},
},
)
class WorkflowMemory(BaseModel):
"""Workflow-scoped key-value cache for entities and facts.
Extracted during agent rounds, persisted for cross-round and cross-workflow reuse."""
id: str = Field(default_factory=lambda: str(uuid.uuid4()), description="Primary key")
workflowId: str = Field(description="FK to the workflow")
userId: str = Field(description="Owner user ID")
featureInstanceId: str = Field(default="", description="Feature instance scope")
key: str = Field(description="Key identifier (e.g. 'entity:companyName')")
value: str = Field(description="Extracted value")
source: str = Field(default="extraction", description="Origin: extraction, tool, conversation, summary")
createdAt: float = Field(default_factory=getUtcTimestamp, description="Creation timestamp")
embedding: Optional[List[float]] = Field(
default=None, description="Optional embedding for semantic lookup",
json_schema_extra={"db_type": "vector(1536)"}
)
registerModelLabels(
"WorkflowMemory",
{"en": "Workflow Memory", "fr": "Mémoire de workflow"},
{
"id": {"en": "ID", "fr": "ID"},
"workflowId": {"en": "Workflow ID", "fr": "ID du workflow"},
"userId": {"en": "User ID", "fr": "ID utilisateur"},
"featureInstanceId": {"en": "Feature Instance ID", "fr": "ID de l'instance"},
"key": {"en": "Key", "fr": "Clé"},
"value": {"en": "Value", "fr": "Valeur"},
"source": {"en": "Source", "fr": "Source"},
"createdAt": {"en": "Created At", "fr": "Créé le"},
"embedding": {"en": "Embedding", "fr": "Vecteur d'embedding"},
},
)