# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""
|
|
Document Generation Path
|
|
|
|
Handles document generation using existing chapter/section model.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import time
|
|
import copy
|
|
from typing import Dict, Any, List, Optional
|
|
from modules.datamodels.datamodelWorkflow import AiResponse, AiResponseMetadata, DocumentData
|
|
from modules.datamodels.datamodelExtraction import ContentPart, DocumentIntent
|
|
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
|
|
from modules.datamodels.datamodelDocument import RenderedDocument
|
|
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class DocumentGenerationPath:
    """Document generation path (existing functionality, refactored).

    Orchestrates the generation pipeline via the injected service container:
    intent clarification -> content preparation -> structure generation ->
    structure filling -> rendering -> response assembly.
    """

    def __init__(self, services):
        # Service container exposing ai, chat, utils, workflow and user context.
        self.services = services

    def _filterPreExtractedDuplicates(self, documents: List[Any]) -> List[Any]:
        """Drop original documents already covered by pre-extracted JSONs.

        Pre-extracted JSON documents already contain the ContentParts of their
        original source document, so keeping both would create duplicates.

        Args:
            documents: Chat documents resolved from the document reference list.

        Returns:
            Filtered list containing all pre-extracted JSONs plus any original
            document that is not covered by one.
        """
        # Resolve each document exactly once (previously this was resolved
        # twice per document, once in each of two sequential loops).
        resolved = [
            (doc, self.services.ai.intentAnalyzer.resolvePreExtractedDocument(doc))
            for doc in documents
        ]

        # Step 1: collect the IDs of originals covered by a pre-extracted JSON.
        coveredOriginalIds = set()
        for doc, preExtracted in resolved:
            if preExtracted:
                originalDocId = preExtracted["originalDocument"]["id"]
                coveredOriginalIds.add(originalDocId)
                logger.debug(f"Found pre-extracted JSON {doc.id} covering original document {originalDocId}")

        # Step 2: keep pre-extracted JSONs and any document not covered by one;
        # skip originals whose content already arrives via a pre-extracted JSON.
        filteredDocuments = []
        for doc, preExtracted in resolved:
            if preExtracted:
                filteredDocuments.append(doc)
            elif doc.id in coveredOriginalIds:
                logger.info(f"Skipping original document {doc.id} ({doc.fileName}) - already covered by pre-extracted JSON")
            else:
                filteredDocuments.append(doc)
        return filteredDocuments

    def _buildDocumentDataList(
        self,
        renderedDocuments: List[Any],
        sourceJson: Optional[Dict[str, Any]]
    ) -> List[DocumentData]:
        """Convert rendered documents to DocumentData objects (best-effort).

        Args:
            renderedDocuments: RenderedDocument instances from renderResult().
            sourceJson: Preserved filled structure; attached only to the first
                successfully converted document (used downstream for validation).

        Returns:
            List of DocumentData; documents that fail conversion are logged
            and skipped rather than aborting the whole batch.
        """
        documentDataList: List[DocumentData] = []
        for renderedDoc in renderedDocuments:
            try:
                docDataObj = DocumentData(
                    documentName=renderedDoc.filename,
                    documentData=renderedDoc.documentData,
                    mimeType=renderedDoc.mimeType,
                    # Only the first document carries the source structure.
                    sourceJson=sourceJson if not documentDataList else None
                )
                documentDataList.append(docDataObj)
                logger.debug(f"Added rendered document: {renderedDoc.filename} ({len(renderedDoc.documentData)} bytes, {renderedDoc.mimeType})")
            except Exception as e:
                # Best-effort: one broken document must not discard the others.
                logger.warning(f"Error creating document {renderedDoc.filename}: {str(e)}")
        return documentDataList

    async def generateDocument(
        self,
        userPrompt: str,
        documentList: Optional[Any] = None,  # DocumentReferenceList
        documentIntents: Optional[List[DocumentIntent]] = None,
        contentParts: Optional[List[ContentPart]] = None,
        outputFormat: str = "txt",
        title: Optional[str] = None,
        parentOperationId: Optional[str] = None
    ) -> AiResponse:
        """
        Generate document using existing chapter/section model.

        Args:
            userPrompt: Free-text instruction from the user.
            documentList: Optional DocumentReferenceList of input documents.
            documentIntents: Pre-clarified intents; clarified here if omitted.
            contentParts: Pre-supplied content parts, merged with extracted ones.
            outputFormat: Target format of the rendered document (e.g. "txt").
            title: Optional document title; falls back to structure metadata.
            parentOperationId: Optional parent operation for progress nesting.

        Returns:
            AiResponse with the documents list.

        Raises:
            ValueError: If no documents could be rendered.
        """
        # Create operation ID (workflow-scoped when a workflow is active).
        workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
        docOperationId = f"doc_gen_{workflowId}_{int(time.time())}"

        # Start progress tracking
        self.services.chat.progressLogStart(
            docOperationId,
            "Document Generation",
            "Document Generation",
            f"Format: {outputFormat}",
            parentOperationId=parentOperationId
        )

        try:
            # Step 5A: clarify document intents
            chatDocuments = []
            if documentList:
                chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(documentList)

            # Remove originals duplicated by pre-extracted JSONs.
            chatDocuments = self._filterPreExtractedDuplicates(chatDocuments)

            checkWorkflowStopped(self.services)

            if not documentIntents and chatDocuments:
                documentIntents = await self.services.ai.clarifyDocumentIntents(
                    chatDocuments,
                    userPrompt,
                    {"outputFormat": outputFormat},
                    docOperationId
                )

            checkWorkflowStopped(self.services)

            # Step 5B: extract and prepare content
            if chatDocuments:
                preparedContentParts = await self.services.ai.extractAndPrepareContent(
                    chatDocuments,
                    documentIntents or [],
                    docOperationId
                )

                # Merge with caller-supplied contentParts (if any).
                if contentParts:
                    for part in contentParts:
                        if part.metadata.get("skipExtraction", False):
                            # Already extracted - use as-is, ensure metadata is complete.
                            part.metadata.setdefault("contentFormat", "extracted")
                            part.metadata.setdefault("isPreExtracted", True)
                    preparedContentParts.extend(contentParts)

                contentParts = preparedContentParts

            # Step 5B.5: Documents are converted to contentParts (like pre-processed
            # JSON files). No AI extraction here - that happens during section
            # generation.
            if contentParts:
                logger.info(f"Using {len(contentParts)} content parts for generation (no AI extraction at this stage)")

            checkWorkflowStopped(self.services)

            # Step 5C: generate structure
            structure = await self.services.ai.generateStructure(
                userPrompt,
                contentParts or [],
                outputFormat,
                docOperationId
            )

            checkWorkflowStopped(self.services)

            # Step 5D: fill structure
            # Language will be extracted from services (user intention analysis)
            # in fillStructure.
            filledStructure = await self.services.ai.fillStructure(
                structure,
                contentParts or [],
                userPrompt,
                docOperationId
            )

            checkWorkflowStopped(self.services)

            # Step 5E: render result
            # Each document is rendered individually and may return 1..n files
            # (e.g. HTML + images). Per-document language is extracted in
            # renderReport() from filledStructure; the validated
            # currentUserLanguage serves as the global fallback.
            language = getattr(self.services, 'currentUserLanguage', None) or "en"

            # IMPORTANT: Deep-copy BEFORE renderResult, which may modify the
            # structure; sourceJson must keep the complete structure with
            # elements for downstream validation.
            filledStructureForSourceJson = copy.deepcopy(filledStructure) if filledStructure else None

            renderedDocuments = await self.services.ai.renderResult(
                filledStructure,
                outputFormat,
                language,  # Global fallback (per-document language extracted from structure in renderReport)
                title or "Generated Document",
                userPrompt,
                docOperationId
            )

            # Build response: convert all rendered documents to DocumentData.
            documentDataList = self._buildDocumentDataList(renderedDocuments, filledStructureForSourceJson)

            if not documentDataList:
                raise ValueError("No documents were rendered")

            metadata = AiResponseMetadata(
                title=title or filledStructure.get("metadata", {}).get("title", "Generated Document"),
                operationType=OperationTypeEnum.DATA_GENERATE.value
            )

            # Debug log (harmonized).
            self.services.utils.writeDebugFile(
                json.dumps(filledStructure, indent=2, ensure_ascii=False, default=str),
                "document_generation_response"
            )

            self.services.chat.progressLogFinish(docOperationId, True)

            return AiResponse(
                content=json.dumps(filledStructure),
                metadata=metadata,
                documents=documentDataList
            )

        except Exception as e:
            logger.error(f"Error in document generation: {str(e)}")
            self.services.chat.progressLogFinish(docOperationId, False)
            raise