API and persisted records use PowerOnModel system fields: - sysCreatedAt, sysCreatedBy, sysModifiedAt, sysModifiedBy Removed legacy JSON/DB field names: - _createdAt, _createdBy, _modifiedAt, _modifiedBy Frontend (frontend_nyla) and gateway call sites were updated accordingly. Database: - Bootstrap runs idempotent backfill (_migrateSystemFieldColumns) from old underscore columns and selected business duplicates into sys* where sys* IS NULL. - Re-run app bootstrap against each PostgreSQL database after deploy. - Optional: DROP INDEX IF EXISTS "idx_invitation_createdby" if an old index remains; new index: idx_invitation_syscreatedby on Invitation(sysCreatedBy). Tests: - RBAC integration tests aligned with current GROUP mandate filter and UserMandate-based UserConnection GROUP clause; buildRbacWhereClause(..., mandateId=...) must be passed explicitly (same as production request context).
187 lines
9.3 KiB
Python
187 lines
9.3 KiB
Python
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Knowledge Store data models: FileContentIndex, ContentChunk, RoundMemory, WorkflowMemory.

These models support the 3-tier RAG architecture:
- Shared Layer: mandateId-scoped, isShared=True
- Instance Layer: userId + featureInstanceId-scoped
- Workflow Layer: workflowId-scoped (RoundMemory, WorkflowMemory)

Vector fields use json_schema_extra={"db_type": "vector(1536)"} for pgvector.
"""
|
|
|
|
from typing import Dict, Any, List, Optional
|
|
from pydantic import BaseModel, Field
|
|
from modules.datamodels.datamodelBase import PowerOnModel
|
|
from modules.shared.attributeUtils import registerModelLabels
|
|
from modules.shared.timeUtils import getUtcTimestamp
|
|
import uuid
|
|
|
|
|
|
class FileContentIndex(PowerOnModel):
    """Per-file structural index of extracted content objects (built without AI).

    Records belong to the Instance Layer by default and can additionally be
    exposed in the Shared Layer by setting ``isShared``.
    """

    # --- Identity and scoping -------------------------------------------
    id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Primary key (typically = fileId)",
    )
    userId: str = Field(description="Owner user ID")
    featureInstanceId: str = Field(
        default="",
        description="Feature instance scope",
    )
    mandateId: str = Field(
        default="",
        description="Mandate scope",
    )
    isShared: bool = Field(
        default=False,
        description="Visible in Shared Layer for all mandate users",
    )

    # --- Source file description ----------------------------------------
    fileName: str = Field(description="Original file name")
    mimeType: str = Field(description="MIME type of the file")
    containerPath: Optional[str] = Field(
        default=None,
        description="Path within a container (e.g. 'archive.zip/folder/report.pdf')",
    )

    # --- Extraction results ---------------------------------------------
    totalObjects: int = Field(
        default=0,
        description="Total number of content objects extracted",
    )
    totalSize: int = Field(
        default=0,
        description="Total size of all content objects in bytes",
    )
    structure: Dict[str, Any] = Field(
        default_factory=dict,
        description="Structural overview (pages, sections, hierarchy)",
    )
    objectSummary: List[Dict[str, Any]] = Field(
        default_factory=list,
        description="Compact summary per content object",
    )
    # Defaults to the value returned by getUtcTimestamp() at instantiation.
    extractedAt: float = Field(
        default_factory=getUtcTimestamp,
        description="Extraction timestamp",
    )

    # --- Processing / visibility state ----------------------------------
    status: str = Field(
        default="pending",
        description="Processing status: pending, extracted, embedding, indexed, failed",
    )
    scope: str = Field(
        default="personal",
        description="Data visibility scope: personal, featureInstance, mandate, global",
    )
    # None means that neutralization was not required for this file.
    neutralizationStatus: Optional[str] = Field(
        default=None,
        description="Neutralization status: completed, failed, skipped, None = not required",
    )
    isNeutralized: bool = Field(
        default=False,
        description="True if content was neutralized before indexing",
    )
|
|
|
|
|
|
# Localized display labels for FileContentIndex: the model-level label first,
# then one entry per field, keyed by field name.
registerModelLabels(
    "FileContentIndex",
    {"en": "File Content Index", "fr": "Index du contenu de fichier"},
    {
        "id": {"en": "ID", "fr": "ID"},
        "userId": {"en": "User ID", "fr": "ID utilisateur"},
        "featureInstanceId": {"en": "Feature Instance ID", "fr": "ID de l'instance"},
        "mandateId": {"en": "Mandate ID", "fr": "ID du mandat"},
        "isShared": {"en": "Shared", "fr": "Partagé"},
        "fileName": {"en": "File Name", "fr": "Nom de fichier"},
        "mimeType": {"en": "MIME Type", "fr": "Type MIME"},
        "containerPath": {"en": "Container Path", "fr": "Chemin du conteneur"},
        "totalObjects": {"en": "Total Objects", "fr": "Nombre total d'objets"},
        "totalSize": {"en": "Total Size", "fr": "Taille totale"},
        "structure": {"en": "Structure", "fr": "Structure"},
        "objectSummary": {"en": "Object Summary", "fr": "Résumé des objets"},
        "extractedAt": {"en": "Extracted At", "fr": "Extrait le"},
        "status": {"en": "Status", "fr": "Statut"},
        # The next three entries previously had only en/de labels, leaving
        # them without a French label unlike every other field here. The
        # existing German labels are kept as-is.
        "scope": {"en": "Scope", "fr": "Portée", "de": "Sichtbarkeit"},
        "neutralizationStatus": {
            "en": "Neutralization Status",
            "fr": "Statut de neutralisation",
            "de": "Neutralisierungsstatus",
        },
        "isNeutralized": {
            "en": "Is Neutralized",
            "fr": "Neutralisé",
            "de": "Neutralisiert",
        },
    },
)
|
|
|
|
|
|
class ContentChunk(PowerOnModel):
    """Stored content chunk together with its embedding vector.

    Holds a scalar content object (or a chunk of one) and a pgvector
    embedding; chunks are reusable across workflows.
    """

    # --- Identity and references ----------------------------------------
    id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Primary key",
    )
    contentObjectId: str = Field(
        description="Reference to the content object within FileContentIndex",
    )
    fileId: str = Field(description="FK to the source file")
    userId: str = Field(description="Owner user ID")
    featureInstanceId: str = Field(
        default="",
        description="Feature instance scope",
    )

    # --- Payload ----------------------------------------------------------
    contentType: str = Field(
        description="Content type: text, image, videostream, audiostream, other",
    )
    data: str = Field(description="Content data (text, base64, URL)")
    contextRef: Dict[str, Any] = Field(
        default_factory=dict,
        description="Context reference (page, position, label)",
    )
    summary: Optional[str] = Field(
        default=None,
        description="AI-generated summary (on demand)",
    )
    chunkMetadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional metadata",
    )

    # --- Vector -----------------------------------------------------------
    # The db_type hint declares the column as a 1536-dimensional pgvector.
    embedding: Optional[List[float]] = Field(
        default=None,
        description="pgvector embedding (NOT NULL for text chunks)",
        json_schema_extra={"db_type": "vector(1536)"},
    )
|
|
|
|
|
|
# Localized (en/fr) display labels for ContentChunk: the model-level label
# first, then one entry per field, keyed by field name.
registerModelLabels(
    "ContentChunk",
    {"en": "Content Chunk", "fr": "Fragment de contenu"},
    {
        # Identity and references
        "id": {"en": "ID", "fr": "ID"},
        "contentObjectId": {
            "en": "Content Object ID",
            "fr": "ID de l'objet de contenu",
        },
        "fileId": {"en": "File ID", "fr": "ID du fichier"},
        "userId": {"en": "User ID", "fr": "ID utilisateur"},
        "featureInstanceId": {
            "en": "Feature Instance ID",
            "fr": "ID de l'instance",
        },
        # Payload
        "contentType": {"en": "Content Type", "fr": "Type de contenu"},
        "data": {"en": "Data", "fr": "Données"},
        "contextRef": {"en": "Context Reference", "fr": "Référence contextuelle"},
        "summary": {"en": "Summary", "fr": "Résumé"},
        "chunkMetadata": {"en": "Metadata", "fr": "Métadonnées"},
        # Vector
        "embedding": {"en": "Embedding", "fr": "Vecteur d'embedding"},
    },
)
|
|
|
|
|
|
class RoundMemory(PowerOnModel):
    """Per-round persistent memory of agent tool results, file refs and decisions.

    Written after each agent round so that RAG can still retrieve the relevant
    context once the ConversationManager has summarised older messages away.
    """

    # --- Identity and position in the workflow ---------------------------
    id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Primary key",
    )
    workflowId: str = Field(description="FK to the workflow")
    roundNumber: int = Field(
        default=0,
        description="Agent round that produced this memory",
    )

    # --- Classification ----------------------------------------------------
    memoryType: str = Field(
        description="Category: file_ref, tool_result, decision, data_source_ref",
    )
    key: str = Field(
        description="Dedup key, e.g. 'readFile:<fileId>' or 'plan'",
    )

    # --- Content -------------------------------------------------------------
    summary: str = Field(
        default="",
        description="Compact summary (max ~2000 chars)",
    )
    fullData: Optional[str] = Field(
        default=None,
        description="Full tool output when small enough (max ~8000 chars)",
    )
    fileIds: List[str] = Field(
        default_factory=list,
        description="Referenced file IDs",
    )

    # --- Vector --------------------------------------------------------------
    # The db_type hint declares the column as a 1536-dimensional pgvector.
    embedding: Optional[List[float]] = Field(
        default=None,
        description="Embedding of summary for semantic retrieval",
        json_schema_extra={"db_type": "vector(1536)"},
    )
|
|
|
|
|
|
# Localized (en/fr) display labels for RoundMemory: the model-level label
# first, then one entry per field, keyed by field name.
registerModelLabels(
    "RoundMemory",
    {"en": "Round Memory", "fr": "Mémoire de tour"},
    {
        # Identity and position in the workflow
        "id": {"en": "ID", "fr": "ID"},
        "workflowId": {"en": "Workflow ID", "fr": "ID du workflow"},
        "roundNumber": {"en": "Round Number", "fr": "Numéro de tour"},
        # Classification
        "memoryType": {"en": "Memory Type", "fr": "Type de mémoire"},
        "key": {"en": "Key", "fr": "Clé"},
        # Content
        "summary": {"en": "Summary", "fr": "Résumé"},
        "fullData": {"en": "Full Data", "fr": "Données complètes"},
        "fileIds": {"en": "File IDs", "fr": "IDs de fichier"},
        # Vector
        "embedding": {"en": "Embedding", "fr": "Vecteur d'embedding"},
    },
)
|
|
|
|
|
|
class WorkflowMemory(PowerOnModel):
    """Key-value cache of entities and facts, scoped to a workflow.

    Entries are extracted during agent rounds and persisted so they can be
    reused across rounds and across workflows.
    """

    # --- Identity and scoping ---------------------------------------------
    id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Primary key",
    )
    workflowId: str = Field(description="FK to the workflow")
    userId: str = Field(description="Owner user ID")
    featureInstanceId: str = Field(
        default="",
        description="Feature instance scope",
    )

    # --- Cached fact --------------------------------------------------------
    key: str = Field(
        description="Key identifier (e.g. 'entity:companyName')",
    )
    value: str = Field(description="Extracted value")
    source: str = Field(
        default="extraction",
        description="Origin: extraction, tool, conversation, summary",
    )

    # --- Vector ---------------------------------------------------------------
    # The db_type hint declares the column as a 1536-dimensional pgvector.
    embedding: Optional[List[float]] = Field(
        default=None,
        description="Optional embedding for semantic lookup",
        json_schema_extra={"db_type": "vector(1536)"},
    )
|
|
|
|
|
|
# Localized (en/fr) display labels for WorkflowMemory: the model-level label
# first, then one entry per field, keyed by field name.
registerModelLabels(
    "WorkflowMemory",
    {"en": "Workflow Memory", "fr": "Mémoire de workflow"},
    {
        # Identity and scoping
        "id": {"en": "ID", "fr": "ID"},
        "workflowId": {"en": "Workflow ID", "fr": "ID du workflow"},
        "userId": {"en": "User ID", "fr": "ID utilisateur"},
        "featureInstanceId": {
            "en": "Feature Instance ID",
            "fr": "ID de l'instance",
        },
        # Cached fact
        "key": {"en": "Key", "fr": "Clé"},
        "value": {"en": "Value", "fr": "Valeur"},
        "source": {"en": "Source", "fr": "Source"},
        # Vector
        "embedding": {"en": "Embedding", "fr": "Vecteur d'embedding"},
    },
)
|