245 lines
9.3 KiB
Python
245 lines
9.3 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""Knowledge Store data models: FileContentIndex, ContentChunk, RoundMemory, WorkflowMemory.
|
|
|
|
These models support the 3-tier RAG architecture:
|
|
- Personal Layer: scope=personal, userId-scoped
|
|
- Instance Layer: scope=featureInstance, featureInstanceId-scoped
|
|
- Mandate Layer: scope=mandate, mandateId-scoped (visible to all mandate users)
|
|
- Global Layer: scope=global (sysAdmin only)
|
|
- Workflow Layer: workflowId-scoped (RoundMemory, WorkflowMemory)
|
|
|
|
Vector fields use json_schema_extra={"db_type": "vector(1536)"} for pgvector.
|
|
"""
|
|
|
|
from typing import Dict, Any, List, Optional
|
|
from pydantic import BaseModel, Field
|
|
from modules.datamodels.datamodelBase import PowerOnModel
|
|
from modules.shared.i18nRegistry import i18nModel
|
|
from modules.shared.timeUtils import getUtcTimestamp
|
|
import uuid
|
|
|
|
|
|
# FileContentIndex: structural index of the content objects of one file.
# The German class docstring below is kept verbatim on purpose: pydantic
# publishes a model's docstring as the JSON-schema "description", so it is
# runtime-visible text. English rendering: "Structural index of the content
# objects of a file."
@i18nModel("Datei-Inhaltsindex")
class FileContentIndex(PowerOnModel):
    """Struktureller Index der Inhaltsobjekte einer Datei."""

    # --- Identity and ownership -------------------------------------------
    # A fresh UUID4 string is generated when the caller supplies no id.
    id: str = Field(
        description="Primary key (typically = fileId)",
        default_factory=lambda: str(uuid.uuid4()),
        json_schema_extra={"label": "ID"},
    )
    # No default is supplied for the owner.
    userId: str = Field(
        description="Owner user ID",
        json_schema_extra={
            "label": "Benutzer-ID",
            "fk_target": {
                "db": "poweron_app",
                "table": "UserInDB",
                "labelField": "username",
            },
        },
    )
    # Defaults to the empty string rather than None.
    featureInstanceId: str = Field(
        description="Feature instance scope",
        default="",
        json_schema_extra={
            "label": "Feature-Instanz-ID",
            "fk_target": {
                "db": "poweron_app",
                "table": "FeatureInstance",
                "labelField": "label",
            },
        },
    )
    # Defaults to the empty string rather than None.
    mandateId: str = Field(
        description="Mandate scope",
        default="",
        json_schema_extra={
            "label": "Mandanten-ID",
            "fk_target": {
                "db": "poweron_app",
                "table": "Mandate",
                "labelField": "label",
            },
        },
    )

    # --- File description -------------------------------------------------
    fileName: str = Field(
        description="Original file name",
        json_schema_extra={"label": "Dateiname"},
    )
    mimeType: str = Field(
        description="MIME type of the file",
        json_schema_extra={"label": "MIME-Typ"},
    )
    containerPath: Optional[str] = Field(
        description="Path within a container (e.g. 'archive.zip/folder/report.pdf')",
        default=None,
        json_schema_extra={"label": "Container-Pfad"},
    )

    # --- Extraction result ------------------------------------------------
    totalObjects: int = Field(
        description="Total number of content objects extracted",
        default=0,
        json_schema_extra={"label": "Anzahl Objekte"},
    )
    totalSize: int = Field(
        description="Total size of all content objects in bytes",
        default=0,
        json_schema_extra={"label": "Gesamtgroesse"},
    )
    # default_factory gives every instance its own dict / list.
    structure: Dict[str, Any] = Field(
        description="Structural overview (pages, sections, hierarchy)",
        default_factory=dict,
        json_schema_extra={"label": "Struktur"},
    )
    objectSummary: List[Dict[str, Any]] = Field(
        description="Compact summary per content object",
        default_factory=list,
        json_schema_extra={"label": "Objekt-Zusammenfassung"},
    )
    # Filled by getUtcTimestamp() at construction time when not supplied.
    extractedAt: float = Field(
        description="Extraction timestamp",
        default_factory=getUtcTimestamp,
        json_schema_extra={"label": "Extrahiert am", "frontend_type": "timestamp"},
    )

    # --- Processing state and visibility ----------------------------------
    # The value sets for status and scope are listed only in the description
    # text; the annotations are plain str and do not restrict them.
    status: str = Field(
        description="Processing status: pending, extracted, embedding, indexed, failed",
        default="pending",
        json_schema_extra={"label": "Status"},
    )
    scope: str = Field(
        description="Data visibility scope: personal, featureInstance, mandate, global",
        default="personal",
        json_schema_extra={"label": "Sichtbarkeit"},
    )

    # --- Neutralization ---------------------------------------------------
    neutralizationStatus: Optional[str] = Field(
        description="Neutralization status: completed, failed, skipped, None = not required",
        default=None,
        json_schema_extra={"label": "Neutralisierungsstatus"},
    )
    isNeutralized: bool = Field(
        description="True if content was neutralized before indexing",
        default=False,
        json_schema_extra={"label": "Neutralisiert"},
    )
|
|
|
|
|
|
# ContentChunk: persisted content chunk together with its embedding vector.
# The German class docstring below is kept verbatim on purpose: pydantic
# publishes a model's docstring as the JSON-schema "description", so it is
# runtime-visible text. English rendering: "Persisted content chunk with
# embedding vector."
@i18nModel("Inhalts-Chunk")
class ContentChunk(PowerOnModel):
    """Persistierter Inhalts-Chunk mit Embedding-Vektor."""

    # --- Identity and references ------------------------------------------
    # A fresh UUID4 string is generated when the caller supplies no id.
    id: str = Field(
        description="Primary key",
        default_factory=lambda: str(uuid.uuid4()),
        json_schema_extra={"label": "ID"},
    )
    contentObjectId: str = Field(
        description="Reference to the content object within FileContentIndex",
        json_schema_extra={"label": "Inhaltsobjekt-ID"},
    )
    # Unlike the user / feature-instance references, this FK metadata names
    # the "poweron_management" database.
    fileId: str = Field(
        description="FK to the source file",
        json_schema_extra={
            "label": "Datei-ID",
            "fk_target": {
                "db": "poweron_management",
                "table": "FileItem",
                "labelField": "fileName",
            },
        },
    )
    userId: str = Field(
        description="Owner user ID",
        json_schema_extra={
            "label": "Benutzer-ID",
            "fk_target": {
                "db": "poweron_app",
                "table": "UserInDB",
                "labelField": "username",
            },
        },
    )
    # Defaults to the empty string rather than None.
    featureInstanceId: str = Field(
        description="Feature instance scope",
        default="",
        json_schema_extra={
            "label": "Feature-Instanz-ID",
            "fk_target": {
                "db": "poweron_app",
                "table": "FeatureInstance",
                "labelField": "label",
            },
        },
    )

    # --- Payload ----------------------------------------------------------
    # The content-type values are listed only in the description text; the
    # annotation is a plain str and does not restrict them.
    contentType: str = Field(
        description="Content type: text, image, videostream, audiostream, other",
        json_schema_extra={"label": "Inhaltstyp"},
    )
    data: str = Field(
        description="Content data (text, base64, URL)",
        json_schema_extra={"label": "Daten"},
    )
    # default_factory gives every instance its own dict.
    contextRef: Dict[str, Any] = Field(
        description="Context reference (page, position, label)",
        default_factory=dict,
        json_schema_extra={"label": "Kontext-Referenz"},
    )
    summary: Optional[str] = Field(
        description="AI-generated summary (on demand)",
        default=None,
        json_schema_extra={"label": "Zusammenfassung"},
    )
    chunkMetadata: Dict[str, Any] = Field(
        description="Additional metadata",
        default_factory=dict,
        json_schema_extra={"label": "Metadaten"},
    )

    # --- Vector -----------------------------------------------------------
    # The annotation is Optional with default None even though the
    # description says "NOT NULL for text chunks"; nothing in this class
    # enforces that rule.
    # "db_type" carries the column type "vector(1536)" as schema metadata.
    embedding: Optional[List[float]] = Field(
        description="pgvector embedding (NOT NULL for text chunks)",
        default=None,
        json_schema_extra={"label": "Embedding", "db_type": "vector(1536)"},
    )
|
|
|
|
|
|
# RoundMemory: persistent memory entry recorded per agent round.
# The German class docstring below is kept verbatim on purpose: pydantic
# publishes a model's docstring as the JSON-schema "description", so it is
# runtime-visible text. English rendering: "Persistent memory per agent
# round."
@i18nModel("Runden-Speicher")
class RoundMemory(PowerOnModel):
    """Persistenter Speicher pro Agenten-Runde."""

    # --- Identity ---------------------------------------------------------
    # A fresh UUID4 string is generated when the caller supplies no id.
    id: str = Field(
        description="Primary key",
        default_factory=lambda: str(uuid.uuid4()),
        json_schema_extra={"label": "ID"},
    )
    # NOTE(review): this workflow reference carries only a label and no
    # "fk_target" metadata -- confirm whether that is intentional.
    workflowId: str = Field(
        description="FK to the workflow",
        json_schema_extra={"label": "Workflow-ID"},
    )
    roundNumber: int = Field(
        description="Agent round that produced this memory",
        default=0,
        json_schema_extra={"label": "Rundennummer"},
    )

    # --- Classification ---------------------------------------------------
    # The category values are listed only in the description text; the
    # annotation is a plain str and does not restrict them.
    memoryType: str = Field(
        description="Category: file_ref, tool_result, decision, data_source_ref",
        json_schema_extra={"label": "Speichertyp"},
    )
    key: str = Field(
        description="Dedup key, e.g. 'readFile:<fileId>' or 'plan'",
        json_schema_extra={"label": "Schluessel"},
    )

    # --- Content ----------------------------------------------------------
    # The size limits mentioned in the two descriptions below are
    # documentation only; no length constraint is declared on these fields.
    summary: str = Field(
        description="Compact summary (max ~2000 chars)",
        default="",
        json_schema_extra={"label": "Zusammenfassung"},
    )
    fullData: Optional[str] = Field(
        description="Full tool output when small enough (max ~8000 chars)",
        default=None,
        json_schema_extra={"label": "Volldaten"},
    )
    # default_factory gives every instance its own list.
    fileIds: List[str] = Field(
        description="Referenced file IDs",
        default_factory=list,
        json_schema_extra={"label": "Datei-IDs"},
    )

    # --- Vector -----------------------------------------------------------
    # "db_type" carries the column type "vector(1536)" as schema metadata.
    embedding: Optional[List[float]] = Field(
        description="Embedding of summary for semantic retrieval",
        default=None,
        json_schema_extra={"label": "Embedding", "db_type": "vector(1536)"},
    )
|
|
|
|
|
|
# WorkflowMemory: workflow-specific key/value cache for entities and facts.
# The German class docstring below is kept verbatim on purpose: pydantic
# publishes a model's docstring as the JSON-schema "description", so it is
# runtime-visible text. English rendering: "Workflow-specific key-value
# cache for entities and facts."
@i18nModel("Workflow-Speicher")
class WorkflowMemory(PowerOnModel):
    """Workflow-spezifischer Key-Value-Cache fuer Entitaeten und Fakten."""

    # --- Identity and ownership -------------------------------------------
    # A fresh UUID4 string is generated when the caller supplies no id.
    id: str = Field(
        description="Primary key",
        default_factory=lambda: str(uuid.uuid4()),
        json_schema_extra={"label": "ID"},
    )
    workflowId: str = Field(
        description="FK to the workflow",
        json_schema_extra={
            "label": "Workflow-ID",
            "fk_target": {
                "db": "poweron_chat",
                "table": "ChatWorkflow",
                "labelField": "name",
            },
        },
    )
    userId: str = Field(
        description="Owner user ID",
        json_schema_extra={
            "label": "Benutzer-ID",
            "fk_target": {
                "db": "poweron_app",
                "table": "UserInDB",
                "labelField": "username",
            },
        },
    )
    # Defaults to the empty string rather than None.
    featureInstanceId: str = Field(
        description="Feature instance scope",
        default="",
        json_schema_extra={
            "label": "Feature-Instanz-ID",
            "fk_target": {
                "db": "poweron_app",
                "table": "FeatureInstance",
                "labelField": "label",
            },
        },
    )

    # --- Key / value pair -------------------------------------------------
    key: str = Field(
        description="Key identifier (e.g. 'entity:companyName')",
        json_schema_extra={"label": "Schluessel"},
    )
    value: str = Field(
        description="Extracted value",
        json_schema_extra={"label": "Wert"},
    )
    # The origin values are listed only in the description text; the
    # annotation is a plain str and does not restrict them.
    source: str = Field(
        description="Origin: extraction, tool, conversation, summary",
        default="extraction",
        json_schema_extra={"label": "Quelle"},
    )

    # --- Vector -----------------------------------------------------------
    # "db_type" carries the column type "vector(1536)" as schema metadata.
    embedding: Optional[List[float]] = Field(
        description="Optional embedding for semantic lookup",
        default=None,
        json_schema_extra={"label": "Embedding", "db_type": "vector(1536)"},
    )
|