gateway/modules/services/serviceGeneration/paths/documentPath.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Document Generation Path
Handles document generation using existing chapter/section model.
"""

import json
import logging
import time
import copy
from typing import Dict, Any, List, Optional
from modules.datamodels.datamodelWorkflow import AiResponse, AiResponseMetadata, DocumentData
from modules.datamodels.datamodelExtraction import ContentPart, DocumentIntent
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
from modules.datamodels.datamodelDocument import RenderedDocument
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped
logger = logging.getLogger(__name__)


class DocumentGenerationPath:
    """Document generation path (existing functionality, refactored)."""

    def __init__(self, services):
        self.services = services

    async def generateDocument(
self,
userPrompt: str,
documentList: Optional[Any] = None, # DocumentReferenceList
documentIntents: Optional[List[DocumentIntent]] = None,
contentParts: Optional[List[ContentPart]] = None,
outputFormat: str = "txt",
title: Optional[str] = None,
parentOperationId: Optional[str] = None
) -> AiResponse:
"""
Generate document using existing chapter/section model.
Returns: AiResponse with documents list
"""
# Create operation ID
workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
docOperationId = f"doc_gen_{workflowId}_{int(time.time())}"
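        # e.g. docOperationId == "doc_gen_<workflowId>_1735689600"
        # (the timestamp value here is purely illustrative)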
# Start progress tracking
self.services.chat.progressLogStart(
docOperationId,
"Document Generation",
"Document Generation",
f"Format: {outputFormat}",
parentOperationId=parentOperationId
)
try:
            # Step 5A: Clarify document intents
documents = []
if documentList:
documents = self.services.chat.getChatDocumentsFromDocumentList(documentList)
                # Filter: drop original documents when pre-extracted JSONs already exist
                # (avoids duplicates - the pre-extracted JSONs already contain the ContentParts)
                # Pass 1: collect the IDs of all original documents that are covered by pre-extracted JSONs
originalDocIdsCoveredByPreExtracted = set()
for doc in documents:
preExtracted = self.services.ai.intentAnalyzer.resolvePreExtractedDocument(doc)
if preExtracted:
originalDocId = preExtracted["originalDocument"]["id"]
originalDocIdsCoveredByPreExtracted.add(originalDocId)
logger.debug(f"Found pre-extracted JSON {doc.id} covering original document {originalDocId}")
                # Pass 2: filter the list - drop original documents already covered by pre-extracted JSONs
filteredDocuments = []
for doc in documents:
preExtracted = self.services.ai.intentAnalyzer.resolvePreExtractedDocument(doc)
if preExtracted:
                        # Keep the pre-extracted JSON
filteredDocuments.append(doc)
elif doc.id in originalDocIdsCoveredByPreExtracted:
                        # Original document already covered by a pre-extracted JSON - drop it
logger.info(f"Skipping original document {doc.id} ({doc.fileName}) - already covered by pre-extracted JSON")
else:
                        # Regular document without a pre-extracted JSON - keep it
filteredDocuments.append(doc)
documents = filteredDocuments
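                # Illustrative example (hypothetical file names): given
                # [report.pdf, report.extracted.json, notes.txt], where
                # report.extracted.json covers report.pdf, the filtered
                # list is [report.extracted.json, notes.txt].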
checkWorkflowStopped(self.services)
if not documentIntents and documents:
documentIntents = await self.services.ai.clarifyDocumentIntents(
documents,
userPrompt,
{"outputFormat": outputFormat},
docOperationId
)
checkWorkflowStopped(self.services)
            # Step 5B: Extract and prepare content
if documents:
preparedContentParts = await self.services.ai.extractAndPrepareContent(
documents,
documentIntents or [],
docOperationId
)
                # Merge with caller-provided contentParts (if any)
if contentParts:
                    # Check for pre-extracted content
for part in contentParts:
if part.metadata.get("skipExtraction", False):
                            # Already extracted - use as-is and make sure the metadata is complete
part.metadata.setdefault("contentFormat", "extracted")
part.metadata.setdefault("isPreExtracted", True)
preparedContentParts.extend(contentParts)
contentParts = preparedContentParts
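                # A pre-extracted part is expected to carry metadata such as
                # {"skipExtraction": True, "contentFormat": "extracted", "isPreExtracted": True}
                # (key names as used above; the combination shown is illustrative).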
            # Step 5B.5: documents are converted to contentParts (like pre-processed JSON files)
# No AI extraction here - AI extraction happens during section generation
if contentParts:
logger.info(f"Using {len(contentParts)} content parts for generation (no AI extraction at this stage)")
checkWorkflowStopped(self.services)
            # Step 5C: Generate the structure
structure = await self.services.ai.generateStructure(
userPrompt,
contentParts or [],
outputFormat,
docOperationId
)
checkWorkflowStopped(self.services)
            # Step 5D: Fill the structure
# Language will be extracted from services (user intention analysis) in fillStructure
filledStructure = await self.services.ai.fillStructure(
structure,
contentParts or [],
userPrompt,
docOperationId
)
checkWorkflowStopped(self.services)
            # Step 5E: Render the result
            # Each document is rendered individually and may return 1..n files (e.g. HTML + images)
# Language is already validated in structure (State 3) and preserved in filled structure (State 4)
# Per-document language will be extracted in renderReport() from filledStructure
# Use validated currentUserLanguage as global fallback (always valid infrastructure)
            language = getattr(self.services, "currentUserLanguage", None) or "en"
# IMPORTANT: Create deep copy BEFORE renderResult to preserve filledStructure with elements
# renderResult might modify the structure, so we need to preserve the original for sourceJson
# This ensures sourceJson contains the complete structure with elements for validation
filledStructureForSourceJson = copy.deepcopy(filledStructure) if filledStructure else None
renderedDocuments = await self.services.ai.renderResult(
filledStructure,
outputFormat,
language, # Global fallback (per-document language extracted from structure in renderReport)
title or "Generated Document",
userPrompt,
docOperationId
)
            # Build the response: convert all rendered documents to DocumentData
documentDataList = []
for renderedDoc in renderedDocuments:
try:
                    # Create a DocumentData for each rendered document
# Use the preserved filledStructureForSourceJson (with elements) for sourceJson
docDataObj = DocumentData(
documentName=renderedDoc.filename,
documentData=renderedDoc.documentData,
mimeType=renderedDoc.mimeType,
                        sourceJson=filledStructureForSourceJson if not documentDataList else None  # only attach to the first document
)
documentDataList.append(docDataObj)
logger.debug(f"Added rendered document: {renderedDoc.filename} ({len(renderedDoc.documentData)} bytes, {renderedDoc.mimeType})")
except Exception as e:
logger.warning(f"Error creating document {renderedDoc.filename}: {str(e)}")
if not documentDataList:
raise ValueError("No documents were rendered")
metadata = AiResponseMetadata(
title=title or filledStructure.get("metadata", {}).get("title", "Generated Document"),
operationType=OperationTypeEnum.DATA_GENERATE.value
)
            # Debug log (harmonized format)
self.services.utils.writeDebugFile(
json.dumps(filledStructure, indent=2, ensure_ascii=False, default=str),
"document_generation_response"
)
self.services.chat.progressLogFinish(docOperationId, True)
return AiResponse(
content=json.dumps(filledStructure),
metadata=metadata,
documents=documentDataList
)
except Exception as e:
logger.error(f"Error in document generation: {str(e)}")
self.services.chat.progressLogFinish(docOperationId, False)
raise
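
# Usage sketch (hypothetical caller, not part of this module; assumes a
# `services` container wired with the `ai`, `chat`, `utils`, and `workflow`
# services referenced above):
#
#     path = DocumentGenerationPath(services)
#     response = await path.generateDocument(
#         userPrompt="Summarize the attached reports",
#         outputFormat="pdf",
#         title="Quarterly Summary",
#     )
#     for doc in response.documents:
#         persist(doc.documentName, doc.documentData)  # `persist` is hypothetical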