gateway/modules/datamodels/datamodelKnowledge.py
2026-04-16 23:13:05 +02:00

245 lines
9 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Knowledge Store data models: FileContentIndex, ContentChunk, RoundMemory, WorkflowMemory.
These models support the layered RAG architecture:
- Personal Layer: scope=personal, userId-scoped
- Instance Layer: scope=featureInstance, featureInstanceId-scoped
- Mandate Layer: scope=mandate, mandateId-scoped (visible to all mandate users)
- Global Layer: scope=global (sysAdmin only)
- Workflow Layer: workflowId-scoped (RoundMemory, WorkflowMemory)
Vector fields use json_schema_extra={"db_type": "vector(1536)"} for pgvector.
"""
from typing import Dict, Any, List, Optional
from pydantic import BaseModel, Field
from modules.datamodels.datamodelBase import PowerOnModel
from modules.shared.i18nRegistry import i18nModel
from modules.shared.timeUtils import getUtcTimestamp
import uuid
@i18nModel("Datei-Inhaltsindex")
class FileContentIndex(PowerOnModel):
    """Structural index of the content objects of a file."""

    # --- Identity and scoping ---------------------------------------------
    # Random UUID4 string unless the caller supplies an id explicitly.
    id: str = Field(
        description="Primary key (typically = fileId)",
        default_factory=lambda: str(uuid.uuid4()),
        json_schema_extra={"label": "ID"},
    )
    # Required: no default is declared for the owner.
    userId: str = Field(
        description="Owner user ID",
        json_schema_extra={
            "label": "Benutzer-ID",
            "fk_target": {"db": "poweron_app", "table": "User"},
        },
    )
    # Empty string (not None) is the declared default for both scope ids.
    featureInstanceId: str = Field(
        description="Feature instance scope",
        default="",
        json_schema_extra={
            "label": "Feature-Instanz-ID",
            "fk_target": {"db": "poweron_app", "table": "FeatureInstance"},
        },
    )
    mandateId: str = Field(
        description="Mandate scope",
        default="",
        json_schema_extra={
            "label": "Mandanten-ID",
            "fk_target": {"db": "poweron_app", "table": "Mandate"},
        },
    )

    # --- Source file description ------------------------------------------
    fileName: str = Field(
        description="Original file name",
        json_schema_extra={"label": "Dateiname"},
    )
    mimeType: str = Field(
        description="MIME type of the file",
        json_schema_extra={"label": "MIME-Typ"},
    )
    containerPath: Optional[str] = Field(
        description="Path within a container (e.g. 'archive.zip/folder/report.pdf')",
        default=None,
        json_schema_extra={"label": "Container-Pfad"},
    )

    # --- Extraction result ------------------------------------------------
    totalObjects: int = Field(
        description="Total number of content objects extracted",
        default=0,
        json_schema_extra={"label": "Anzahl Objekte"},
    )
    totalSize: int = Field(
        description="Total size of all content objects in bytes",
        default=0,
        json_schema_extra={"label": "Gesamtgroesse"},
    )
    # Free-form containers; each instance gets its own fresh dict / list.
    structure: Dict[str, Any] = Field(
        description="Structural overview (pages, sections, hierarchy)",
        default_factory=dict,
        json_schema_extra={"label": "Struktur"},
    )
    objectSummary: List[Dict[str, Any]] = Field(
        description="Compact summary per content object",
        default_factory=list,
        json_schema_extra={"label": "Objekt-Zusammenfassung"},
    )
    # Filled by calling getUtcTimestamp() when the instance is created.
    extractedAt: float = Field(
        description="Extraction timestamp",
        default_factory=getUtcTimestamp,
        json_schema_extra={"label": "Extrahiert am"},
    )

    # --- Processing state and visibility ----------------------------------
    # Annotated as plain str: the values named in the descriptions below are
    # not restricted by these annotations.
    status: str = Field(
        description="Processing status: pending, extracted, embedding, indexed, failed",
        default="pending",
        json_schema_extra={"label": "Status"},
    )
    scope: str = Field(
        description="Data visibility scope: personal, featureInstance, mandate, global",
        default="personal",
        json_schema_extra={"label": "Sichtbarkeit"},
    )
    neutralizationStatus: Optional[str] = Field(
        description="Neutralization status: completed, failed, skipped, None = not required",
        default=None,
        json_schema_extra={"label": "Neutralisierungsstatus"},
    )
    isNeutralized: bool = Field(
        description="True if content was neutralized before indexing",
        default=False,
        json_schema_extra={"label": "Neutralisiert"},
    )
@i18nModel("Inhalts-Chunk")
class ContentChunk(PowerOnModel):
    """Persisted content chunk with its embedding vector."""

    # --- Identity and references ------------------------------------------
    # Random UUID4 string unless the caller supplies an id explicitly.
    id: str = Field(
        description="Primary key",
        default_factory=lambda: str(uuid.uuid4()),
        json_schema_extra={"label": "ID"},
    )
    contentObjectId: str = Field(
        description="Reference to the content object within FileContentIndex",
        json_schema_extra={"label": "Inhaltsobjekt-ID"},
    )
    fileId: str = Field(
        description="FK to the source file",
        json_schema_extra={
            "label": "Datei-ID",
            "fk_target": {"db": "poweron_management", "table": "FileItem"},
        },
    )
    userId: str = Field(
        description="Owner user ID",
        json_schema_extra={
            "label": "Benutzer-ID",
            "fk_target": {"db": "poweron_app", "table": "User"},
        },
    )
    # Empty string (not None) is the declared default.
    featureInstanceId: str = Field(
        description="Feature instance scope",
        default="",
        json_schema_extra={
            "label": "Feature-Instanz-ID",
            "fk_target": {"db": "poweron_app", "table": "FeatureInstance"},
        },
    )

    # --- Payload ------------------------------------------------------------
    # Both fields are required plain strings; the listed content types are
    # not restricted by the annotation.
    contentType: str = Field(
        description="Content type: text, image, videostream, audiostream, other",
        json_schema_extra={"label": "Inhaltstyp"},
    )
    data: str = Field(
        description="Content data (text, base64, URL)",
        json_schema_extra={"label": "Daten"},
    )

    # --- Context and metadata ---------------------------------------------
    contextRef: Dict[str, Any] = Field(
        description="Context reference (page, position, label)",
        default_factory=dict,
        json_schema_extra={"label": "Kontext-Referenz"},
    )
    summary: Optional[str] = Field(
        description="AI-generated summary (on demand)",
        default=None,
        json_schema_extra={"label": "Zusammenfassung"},
    )
    chunkMetadata: Dict[str, Any] = Field(
        description="Additional metadata",
        default_factory=dict,
        json_schema_extra={"label": "Metadaten"},
    )

    # --- Vector ---------------------------------------------------------------
    # Optional with a None default at model level, although the description
    # says text chunks should carry one; this field does not enforce that.
    # The "db_type" entry carries the column type hint "vector(1536)".
    embedding: Optional[List[float]] = Field(
        description="pgvector embedding (NOT NULL for text chunks)",
        default=None,
        json_schema_extra={"label": "Embedding", "db_type": "vector(1536)"},
    )
@i18nModel("Runden-Speicher")
class RoundMemory(PowerOnModel):
    """Persistent memory per agent round."""

    # --- Identity and position in the workflow ------------------------------
    # Random UUID4 string unless the caller supplies an id explicitly.
    id: str = Field(
        description="Primary key",
        default_factory=lambda: str(uuid.uuid4()),
        json_schema_extra={"label": "ID"},
    )
    # Required; no fk_target metadata is declared for this reference.
    workflowId: str = Field(
        description="FK to the workflow",
        json_schema_extra={"label": "Workflow-ID"},
    )
    roundNumber: int = Field(
        description="Agent round that produced this memory",
        default=0,
        json_schema_extra={"label": "Rundennummer"},
    )

    # --- Classification -----------------------------------------------------
    # Both required plain strings; the categories named in the description
    # are not restricted by the annotation.
    memoryType: str = Field(
        description="Category: file_ref, tool_result, decision, data_source_ref",
        json_schema_extra={"label": "Speichertyp"},
    )
    key: str = Field(
        description="Dedup key, e.g. 'readFile:<fileId>' or 'plan'",
        json_schema_extra={"label": "Schluessel"},
    )

    # --- Content ------------------------------------------------------------
    # The size limits mentioned in the descriptions are documentation only;
    # no length constraint is declared on these fields.
    summary: str = Field(
        description="Compact summary (max ~2000 chars)",
        default="",
        json_schema_extra={"label": "Zusammenfassung"},
    )
    fullData: Optional[str] = Field(
        description="Full tool output when small enough (max ~8000 chars)",
        default=None,
        json_schema_extra={"label": "Volldaten"},
    )
    # Each instance gets its own fresh list.
    fileIds: List[str] = Field(
        description="Referenced file IDs",
        default_factory=list,
        json_schema_extra={"label": "Datei-IDs"},
    )

    # --- Vector ---------------------------------------------------------------
    # Optional; the "db_type" entry carries the column type hint "vector(1536)".
    embedding: Optional[List[float]] = Field(
        description="Embedding of summary for semantic retrieval",
        default=None,
        json_schema_extra={"label": "Embedding", "db_type": "vector(1536)"},
    )
@i18nModel("Workflow-Speicher")
class WorkflowMemory(PowerOnModel):
    """Workflow-specific key-value cache for entities and facts."""

    # --- Identity and scoping ---------------------------------------------
    # Random UUID4 string unless the caller supplies an id explicitly.
    id: str = Field(
        description="Primary key",
        default_factory=lambda: str(uuid.uuid4()),
        json_schema_extra={"label": "ID"},
    )
    workflowId: str = Field(
        description="FK to the workflow",
        json_schema_extra={
            "label": "Workflow-ID",
            "fk_target": {"db": "poweron_chat", "table": "ChatWorkflow"},
        },
    )
    userId: str = Field(
        description="Owner user ID",
        json_schema_extra={
            "label": "Benutzer-ID",
            "fk_target": {"db": "poweron_app", "table": "User"},
        },
    )
    # Empty string (not None) is the declared default.
    featureInstanceId: str = Field(
        description="Feature instance scope",
        default="",
        json_schema_extra={
            "label": "Feature-Instanz-ID",
            "fk_target": {"db": "poweron_app", "table": "FeatureInstance"},
        },
    )

    # --- Key / value entry ------------------------------------------------
    # key and value are both required plain strings.
    key: str = Field(
        description="Key identifier (e.g. 'entity:companyName')",
        json_schema_extra={"label": "Schluessel"},
    )
    value: str = Field(
        description="Extracted value",
        json_schema_extra={"label": "Wert"},
    )
    # Plain str defaulting to "extraction"; the origins named in the
    # description are not restricted by the annotation.
    source: str = Field(
        description="Origin: extraction, tool, conversation, summary",
        default="extraction",
        json_schema_extra={"label": "Quelle"},
    )

    # --- Vector ---------------------------------------------------------------
    # Optional; the "db_type" entry carries the column type hint "vector(1536)".
    embedding: Optional[List[float]] = Field(
        description="Optional embedding for semantic lookup",
        default=None,
        json_schema_extra={"label": "Embedding", "db_type": "vector(1536)"},
    )