# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Structure Filling Module

Handles filling document structure with content, including:
- Filling sections with content parts
- Building section generation prompts
- Aggregation logic
"""
import json
import logging
import copy
from typing import Dict, Any, List, Optional

from modules.datamodels.datamodelExtraction import ContentPart
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum, ProcessingModeEnum

logger = logging.getLogger(__name__)


class StructureFiller:
    """Handles filling document structure with content."""

    # Chapter-level instructions that mean "pass the extracted text through unchanged".
    _PASSTHROUGH_INSTRUCTIONS = ("include full text", "include all content", "use full extracted text")

    # Content types for which several content parts are merged into one element.
    _AGGREGATION_TYPES = ("table", "bullet_list")

    # Maximum number of characters of extracted text shown inline in a prompt.
    _PREVIEW_LENGTH = 1000

    def __init__(self, services, aiService):
        """Initialize StructureFiller with service center and AI service access."""
        self.services = services
        self.aiService = aiService

    async def fillStructure(
        self,
        structure: Dict[str, Any],
        contentParts: List[ContentPart],
        userPrompt: str,
        parentOperationId: str
    ) -> Dict[str, Any]:
        """
        Phase 5D: chapter content generation (two-phase approach).

        Phase 5D.1 generates the sections structure for every chapter,
        Phase 5D.2 fills those sections with content parts. Afterwards the
        chapters are flattened into a plain list of sections per document.
        If no document carries a "chapters" key, the legacy section-based
        filling is used instead.

        Args:
            structure: Structure dict with documents and chapters (not sections).
            contentParts: All prepared content parts.
            userPrompt: The user request.
            parentOperationId: Parent operation ID for the chat-log hierarchy.

        Returns:
            The filled structure with "elements" in every section (after flattening).

        Raises:
            Exception: Any error from the generation phases is logged, the
                progress log is closed as failed, and the error is re-raised.
        """
        fillOperationId = f"{parentOperationId}_structure_filling"
        documents = structure.get("documents", [])

        if not any("chapters" in doc for doc in documents):
            # Fallback: old structure with sections directly on the document.
            logger.warning("Structure has no chapters, using legacy section-based filling")
            return await self._fillStructureLegacy(
                structure,
                contentParts,
                userPrompt,
                fillOperationId,
                parentOperationId=parentOperationId
            )

        chapterCount = sum(len(doc.get("chapters", [])) for doc in documents)
        self.services.chat.progressLogStart(
            fillOperationId,
            "Chapter Content Generation",
            "Filling",
            f"Processing {chapterCount} chapters",
            parentOperationId=parentOperationId
        )

        try:
            # Work on a copy so the caller's structure is never mutated.
            filledStructure = copy.deepcopy(structure)

            # Phase 5D.1: generate the sections structure for every chapter.
            filledStructure = await self._generateChapterSectionsStructure(
                filledStructure, contentParts, userPrompt, fillOperationId
            )

            # Phase 5D.2: fill the sections with content parts.
            filledStructure = await self._fillChapterSections(
                filledStructure, contentParts, userPrompt, fillOperationId
            )

            # Flattening: convert chapters into sections.
            flattenedStructure = self._flattenChaptersToSections(filledStructure)

            # Attach content-part metadata to the structure (used for validation).
            flattenedStructure = self._addContentPartsMetadata(flattenedStructure, contentParts)

            self.services.chat.progressLogFinish(fillOperationId, True)
            return flattenedStructure

        except Exception as e:
            self.services.chat.progressLogFinish(fillOperationId, False)
            logger.exception("Error in fillStructure: %s", e)
            raise

    async def _generateChapterSectionsStructure(
        self,
        chapterStructure: Dict[str, Any],
        contentParts: List[ContentPart],
        userPrompt: str,
        parentOperationId: str
    ) -> Dict[str, Any]:
        """
        Phase 5D.1: generate the sections structure of every chapter (no content yet).

        One planning AI call is made per chapter. The returned sections carry
        content_type, contentPartIds, generationHint and useAiCall; when the AI
        omits useAiCall it is derived from the content type and the chapter's
        content-part instructions.

        Args:
            chapterStructure: Structure dict whose documents contain chapters;
                it is modified in place.
            contentParts: All prepared content parts.
            userPrompt: The user request.
            parentOperationId: Operation ID of the surrounding fill operation
                (currently not used by this phase).

        Returns:
            The same chapterStructure with "sections" set on every chapter.
        """
        for doc in chapterStructure.get("documents", []):
            for chapter in doc.get("chapters", []):
                chapterId = chapter.get("id", "unknown")
                contentPartInstructions = chapter.get("contentPartInstructions") or {}

                chapterPrompt = self._buildChapterSectionsStructurePrompt(
                    chapterId=chapterId,
                    chapterLevel=chapter.get("level", 1),
                    chapterTitle=chapter.get("title", ""),
                    generationHint=chapter.get("generationHint", ""),
                    contentPartIds=chapter.get("contentPartIds", []),
                    contentPartInstructions=contentPartInstructions,
                    contentParts=contentParts,
                    userPrompt=userPrompt
                )

                # Debug: log prompt
                self.services.utils.writeDebugFile(
                    chapterPrompt, f"chapter_structure_{chapterId}_prompt"
                )

                aiResponse = await self.aiService.callAiPlanning(
                    prompt=chapterPrompt,
                    debugType=f"chapter_structure_{chapterId}"
                )

                # Debug: log response
                self.services.utils.writeDebugFile(
                    aiResponse, f"chapter_structure_{chapterId}_response"
                )

                sectionsStructure = json.loads(
                    self.services.utils.jsonExtractString(aiResponse)
                )

                # The prompt asks for {"sections": [...]}, but a bare list of
                # sections is accepted as well instead of crashing on .get().
                if isinstance(sectionsStructure, dict):
                    sections = sectionsStructure.get("sections", [])
                elif isinstance(sectionsStructure, list):
                    sections = sectionsStructure
                else:
                    sections = []
                chapter["sections"] = sections

                # Set the useAiCall flag where the AI did not provide it.
                for section in sections:
                    if "useAiCall" not in section:
                        section["useAiCall"] = self._deriveUseAiCall(section, contentPartInstructions)

        return chapterStructure

    def _deriveUseAiCall(
        self,
        section: Dict[str, Any],
        contentPartInstructions: Dict[str, Any]
    ) -> bool:
        """Decide whether a section needs an AI call when the AI did not say so itself."""
        # Everything except a plain paragraph has to be generated.
        if section.get("content_type", "paragraph") != "paragraph":
            return True

        # A paragraph only needs AI when some part carries a real instruction,
        # i.e. anything other than "take the text as it is".
        for partId in section.get("contentPartIds", []):
            instruction = contentPartInstructions.get(partId, {}).get("instruction", "")
            if instruction and instruction.lower() not in self._PASSTHROUGH_INSTRUCTIONS:
                return True
        return False

    async def _fillChapterSections(
        self,
        chapterStructure: Dict[str, Any],
        contentParts: List[ContentPart],
        userPrompt: str,
        parentOperationId: str
    ) -> Dict[str, Any]:
        """
        Phase 5D.2: fill the sections of all chapters with content parts.

        Sections are processed sequentially. Whether AI generation is used for
        a section is controlled by its "useAiCall" flag (set in Phase 5D.1).

        Args:
            chapterStructure: Structure dict with chapters that already contain
                sections; it is modified in place.
            contentParts: All prepared content parts.
            userPrompt: The user request.
            parentOperationId: Operation ID used as parent for the per-section
                progress logs.

        Returns:
            The same chapterStructure with "elements" set on every section.
        """
        # Flat list of all sections; also used as context for the prompts.
        allSections = [
            section
            for doc in chapterStructure.get("documents", [])
            for chapter in doc.get("chapters", [])
            for section in chapter.get("sections", [])
        ]

        for sectionIndex, section in enumerate(allSections):
            section["elements"] = await self._buildSectionElements(
                section=section,
                contentParts=contentParts,
                userPrompt=userPrompt,
                # Phase 5D.1 produces "generationHint"; "generation_hint" is the legacy key.
                generationHint=self._sectionHint(section),
                useAi=bool(section.get("useAiCall", False)),
                allSections=allSections,
                sectionIndex=sectionIndex,
                operationId=parentOperationId
            )

        return chapterStructure

    @staticmethod
    def _sectionHint(section: Dict[str, Any]) -> Optional[str]:
        """Return the section's generation hint, accepting both the legacy and the camelCase key."""
        return section.get("generation_hint") or section.get("generationHint")

    async def _buildSectionElements(
        self,
        *,
        section: Dict[str, Any],
        contentParts: List[ContentPart],
        userPrompt: str,
        generationHint: Optional[str],
        useAi: bool,
        allSections: List[Dict[str, Any]],
        sectionIndex: int,
        operationId: str
    ) -> List[Dict[str, Any]]:
        """
        Build the "elements" list of one section.

        "reference" and "object" parts are always turned into elements directly.
        "extracted" parts are either sent to the AI (when useAi is set) or added
        as raw "extracted_text" elements. When the section needs aggregation and
        useAi is set, all extracted parts go into a single AI call; in that case
        the directly-built elements come first, followed by the generated ones.
        """
        contentPartIds = section.get("contentPartIds", [])
        contentFormats = section.get("contentFormats") or {}
        contentType = section.get("content_type", "paragraph")

        # Resolve IDs to parts, silently dropping IDs that are unknown.
        resolvedParts = [self._findContentPartById(partId, contentParts) for partId in contentPartIds]
        sectionParts = [part for part in resolvedParts if part is not None]

        def formatOf(part: ContentPart) -> Any:
            # A per-section format override wins over the part's own metadata.
            return contentFormats.get(part.id, part.metadata.get("contentFormat"))

        elements: List[Dict[str, Any]] = []
        aggregate = useAi and self._needsAggregation(
            contentType=contentType,
            contentPartCount=len(contentPartIds)
        )

        if aggregate:
            extractedParts = []
            for part in sectionParts:
                contentFormat = formatOf(part)
                if contentFormat == "extracted":
                    extractedParts.append(part)
                    continue
                staticElement = self._buildStaticElement(part, contentFormat)
                if staticElement is not None:
                    elements.append(staticElement)

            if extractedParts:
                elements.extend(await self._generateElementsWithAi(
                    section=section,
                    parts=extractedParts,
                    userPrompt=userPrompt,
                    generationHint=generationHint,
                    allSections=allSections,
                    sectionIndex=sectionIndex,
                    isAggregation=True,
                    operationId=operationId
                ))
            return elements

        # Individual processing: every part on its own, in section order.
        for part in sectionParts:
            contentFormat = formatOf(part)
            if contentFormat != "extracted":
                staticElement = self._buildStaticElement(part, contentFormat)
                if staticElement is not None:
                    elements.append(staticElement)
            elif useAi:
                elements.extend(await self._generateElementsWithAi(
                    section=section,
                    parts=[part],
                    userPrompt=userPrompt,
                    generationHint=generationHint,
                    allSections=allSections,
                    sectionIndex=sectionIndex,
                    isAggregation=False,
                    operationId=operationId
                ))
            else:
                # Add the extracted text directly (no AI call).
                elements.append({
                    "type": "extracted_text",
                    "content": part.data,
                    "source": part.metadata.get("documentId"),
                    "extractionPrompt": part.metadata.get("extractionPrompt")
                })

        return elements

    @staticmethod
    def _buildStaticElement(part: ContentPart, contentFormat: Any) -> Optional[Dict[str, Any]]:
        """Build the element for a "reference" or "object" part; None for any other format."""
        if contentFormat == "reference":
            return {
                "type": "reference",
                "documentReference": part.metadata.get("documentReference"),
                "label": part.metadata.get("usageHint", part.label)
            }
        if contentFormat == "object":
            return {
                "type": part.typeGroup,  # "image", "binary", etc.
                "base64Data": part.data,
                "mimeType": part.mimeType,
                "altText": part.metadata.get("usageHint", part.label)
            }
        return None

    async def _generateElementsWithAi(
        self,
        *,
        section: Dict[str, Any],
        parts: List[ContentPart],
        userPrompt: str,
        generationHint: Optional[str],
        allSections: List[Dict[str, Any]],
        sectionIndex: int,
        isAggregation: bool,
        operationId: str
    ) -> List[Dict[str, Any]]:
        """
        Generate section elements from the given parts with one AI call.

        Never raises: on any failure a single "error" element is returned so the
        section is rendered with an error message instead of aborting the document.
        """
        sectionId = section.get("id")
        generationPrompt = self._buildSectionGenerationPrompt(
            section=section,
            contentParts=parts,
            userPrompt=userPrompt,
            generationHint=generationHint,
            allSections=allSections,
            sectionIndex=sectionIndex,
            isAggregation=isAggregation
        )

        sectionOperationId = f"{operationId}_section_{sectionId}"
        if isAggregation:
            self.services.chat.progressLogStart(
                sectionOperationId,
                "Section Generation (Aggregation)",
                "Section",
                f"Generating section {sectionId} with {len(parts)} parts",
                parentOperationId=operationId
            )
        else:
            self.services.chat.progressLogStart(
                sectionOperationId,
                "Section Generation",
                "Section",
                f"Generating section {sectionId}",
                parentOperationId=operationId
            )

        try:
            # Debug: log prompt
            self.services.utils.writeDebugFile(
                generationPrompt, f"section_content_{sectionId}_prompt"
            )

            # callAi (not callAiPlanning) is used because the request carries content parts.
            request = AiCallRequest(
                prompt=generationPrompt,
                contentParts=parts,
                options=AiCallOptions(
                    operationType=OperationTypeEnum.DATA_ANALYSE,
                    priority=PriorityEnum.BALANCED,
                    processingMode=ProcessingModeEnum.DETAILED
                )
            )
            aiResponse = await self.aiService.callAi(request)

            # Debug: log response
            self.services.utils.writeDebugFile(
                aiResponse.content, f"section_content_{sectionId}_response"
            )

            generated = json.loads(
                self.services.utils.jsonExtractString(aiResponse.content)
            )
            if isinstance(generated, list):
                generatedElements = list(generated)
            elif isinstance(generated, dict) and "elements" in generated:
                generatedElements = list(generated["elements"])
            else:
                generatedElements = []

            self.services.chat.progressLogFinish(sectionOperationId, True)
            return generatedElements

        except Exception as e:
            # Deliberately no re-raise: the section is rendered with the error message.
            self.services.chat.progressLogFinish(sectionOperationId, False)
            logger.exception("Error generating section %s: %s", sectionId, e)
            return [{
                "type": "error",
                "message": f"Error generating section {sectionId}: {str(e)}",
                "sectionId": sectionId
            }]

    def _addContentPartsMetadata(
        self,
        structure: Dict[str, Any],
        contentParts: List[ContentPart]
    ) -> Dict[str, Any]:
        """
        Attach content-part metadata wherever contentPartIds are present.

        This gives validation the context of the content parts. Documents with
        "chapters" get metadata on chapters and their sections; documents with
        only "sections" (legacy or flattened structure) get it on the sections.

        Args:
            structure: Structure dict; it is modified in place.
            contentParts: All prepared content parts.

        Returns:
            The same structure with "contentPartsMetadata" lists added.
        """
        # Mapping from content-part ID to its metadata record.
        contentPartsMap = {
            part.id: {
                "id": part.id,
                "format": part.metadata.get("contentFormat", "unknown"),
                "type": part.typeGroup,
                "mimeType": part.mimeType,
                "originalFileName": part.metadata.get("originalFileName"),
                "usageHint": part.metadata.get("usageHint"),
                "documentId": part.metadata.get("documentId"),
                "dataSize": len(str(part.data)) if part.data else 0
            }
            for part in contentParts
        }

        def attach(container: Dict[str, Any]) -> None:
            # Containers without contentPartIds are left untouched; unknown IDs are skipped.
            partIds = container.get("contentPartIds", [])
            if partIds:
                container["contentPartsMetadata"] = [
                    contentPartsMap[partId] for partId in partIds if partId in contentPartsMap
                ]

        for doc in structure.get("documents", []):
            if "chapters" in doc:
                # New structure: chapters with nested sections.
                for chapter in doc.get("chapters", []):
                    attach(chapter)
                    for section in chapter.get("sections", []):
                        attach(section)
            elif "sections" in doc:
                # Legacy / flattened structure: sections directly on the document.
                for section in doc.get("sections", []):
                    attach(section)

        return structure

    def _flattenChaptersToSections(
        self,
        chapterStructure: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Flatten chapters into the final section structure.

        Every chapter becomes one predefined heading section followed by the
        chapter's own sections.

        Args:
            chapterStructure: Structure dict whose documents contain chapters.

        Returns:
            A new structure dict with "metadata" and, per document, "id",
            "title", "filename" and the flattened "sections".
        """
        # NOTE(review): sections of a document that has no "chapters" key are not
        # carried over here; confirm whether mixed structures can occur.
        result = {
            "metadata": chapterStructure.get("metadata", {}),
            "documents": []
        }

        for doc in chapterStructure.get("documents", []):
            flattenedSections = []
            for chapter in doc.get("chapters", []):
                # 1. Predefined heading section for the chapter title. A missing
                #    id falls back to "unknown", matching Phase 5D.1.
                flattenedSections.append({
                    "id": f"{chapter.get('id', 'unknown')}_heading",
                    "content_type": "heading",
                    "elements": [{
                        "type": "heading",
                        "content": chapter.get("title"),
                        "level": chapter.get("level", 1)
                    }]
                })
                # 2. Generated sections.
                flattenedSections.extend(chapter.get("sections", []))

            result["documents"].append({
                "id": doc.get("id"),
                "title": doc.get("title"),
                "filename": doc.get("filename"),
                "sections": flattenedSections
            })

        return result

    async def _fillStructureLegacy(
        self,
        structure: Dict[str, Any],
        contentParts: List[ContentPart],
        userPrompt: str,
        fillOperationId: str,
        parentOperationId: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Legacy: fill a structure that has sections directly on its documents.

        Kept for backward compatibility. A section uses AI generation when it
        has a generation hint; otherwise extracted text is added as-is.

        Args:
            structure: Structure dict with documents and sections.
            contentParts: All prepared content parts.
            userPrompt: The user request.
            fillOperationId: Operation ID of this fill operation.
            parentOperationId: Parent operation ID for the chat-log hierarchy.
                When omitted, fillOperationId is used as before.

        Returns:
            A filled deep copy of the structure.

        Raises:
            Exception: Any error is logged, the progress log is closed as
                failed, and the error is re-raised.
        """
        # Count over all documents; also safe when there are no documents at all.
        sectionCount = sum(
            len(doc.get("sections", [])) for doc in structure.get("documents", [])
        )
        self.services.chat.progressLogStart(
            fillOperationId,
            "Structure Filling (Legacy)",
            "Filling",
            f"Filling {sectionCount} sections",
            parentOperationId=parentOperationId if parentOperationId is not None else fillOperationId
        )

        try:
            filledStructure = copy.deepcopy(structure)

            allSections = [
                section
                for doc in filledStructure.get("documents", [])
                for section in doc.get("sections", [])
            ]

            for sectionIndex, section in enumerate(allSections):
                generationHint = self._sectionHint(section)
                section["elements"] = await self._buildSectionElements(
                    section=section,
                    contentParts=contentParts,
                    userPrompt=userPrompt,
                    generationHint=generationHint,
                    # In the legacy flow the presence of a hint is the AI switch.
                    useAi=bool(generationHint),
                    allSections=allSections,
                    sectionIndex=sectionIndex,
                    operationId=fillOperationId
                )

            # Attach content-part metadata to the structure (used for validation).
            filledStructure = self._addContentPartsMetadata(filledStructure, contentParts)

            self.services.chat.progressLogFinish(fillOperationId, True)
            return filledStructure

        except Exception as e:
            self.services.chat.progressLogFinish(fillOperationId, False)
            logger.exception("Error in _fillStructureLegacy: %s", e)
            raise

    def _buildChapterSectionsStructurePrompt(
        self,
        chapterId: str,
        chapterLevel: int,
        chapterTitle: str,
        generationHint: str,
        contentPartIds: List[str],
        contentPartInstructions: Dict[str, Any],
        contentParts: List[ContentPart],
        userPrompt: str
    ) -> str:
        """
        Build the prompt for generating a chapter's sections structure.

        The prompt lists the chapter metadata and an index of the chapter's
        content parts (IDs, format, type and instruction only, no previews).
        userPrompt is accepted for interface compatibility but is not part of
        the prompt.
        """
        indexEntries = []
        for partId in contentPartIds:
            part = self._findContentPartById(partId, contentParts)
            if part is None:
                continue
            contentFormat = part.metadata.get("contentFormat", "unknown")
            instruction = contentPartInstructions.get(partId, {}).get("instruction", "Use content as needed")
            indexEntries.append(
                f"\n- ContentPart ID: {partId}\n"
                f" Format: {contentFormat}\n"
                f" Type: {part.typeGroup}\n"
                f" Instruction: {instruction}\n"
            )

        contentPartsIndex = "".join(indexEntries) or "\n(No content parts specified for this chapter)"

        prompt = f"""TASK: Generate Chapter Sections Structure

CHAPTER METADATA:
- Chapter ID: {chapterId}
- Chapter Level: {chapterLevel}
- Chapter Title: {chapterTitle}
- Generation Hint: {generationHint}

WICHTIG: Chapter hat bereits vordefinierte Heading-Section.
Generiere NICHT eine Heading-Section für Chapter-Title!

AVAILABLE CONTENT PARTS:
{contentPartsIndex}

STANDARD JSON SCHEMA FOR SECTIONS:
Supported content_types: table, bullet_list, heading, paragraph, code_block, image

Return JSON:
{{
  "sections": [
    {{
      "id": "section_1",
      "content_type": "paragraph",
      "contentPartIds": ["part_ext_1"],
      "generationHint": "...",
      "useAiCall": false,
      "elements": []
    }}
  ]
}}

CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside the JSON.
"""
        return prompt

    def _buildSectionGenerationPrompt(
        self,
        section: Dict[str, Any],
        contentParts: List[Optional[ContentPart]],
        userPrompt: str,
        generationHint: str,
        allSections: Optional[List[Dict[str, Any]]] = None,
        sectionIndex: Optional[int] = None,
        isAggregation: bool = False
    ) -> str:
        """
        Build the prompt for generating one section, including its context.

        Args:
            section: The section to generate content for.
            contentParts: Parts available to this section; None entries are ignored.
            userPrompt: The user request, embedded for context.
            generationHint: Hint describing what to generate; None is rendered
                as an empty string.
            allSections: All sections of the run, used to describe up to two
                neighbours before and after this section.
            sectionIndex: Index of this section within allSections.
            isAggregation: When True, the parts are only listed by ID and
                metadata and the prompt asks for ONE combined element; when
                False, each part is described with a content preview.

        Returns:
            The complete prompt text.
        """
        validParts = [part for part in contentParts if part is not None]
        sectionId = section.get("id", "unknown")
        contentType = section.get("content_type", "paragraph")
        generationHint = generationHint or ""

        if isAggregation:
            contentPartsText = self._describePartsForAggregation(validParts)
        else:
            contentPartsText = self._describePartsWithPreview(validParts)
        availableContent = contentPartsText or "(No content parts specified for this section)"

        contextText = self._describeSectionContext(allSections, sectionIndex)

        if isAggregation:
            prompt = f"""# TASK: Generate Section Content (Aggregation)

## SECTION METADATA
- Section ID: {sectionId}
- Content Type: {contentType}
- Generation Hint: {generationHint}
{contextText}

## USER REQUEST (for context)
```
{userPrompt}
```

## AVAILABLE CONTENT FOR THIS SECTION
{availableContent}

## INSTRUCTIONS
1. Generate content for section "{sectionId}" based on the generation hint above
2. **AGGREGATION**: Combine ALL provided ContentParts into ONE element (e.g., one table with all data)
3. For table content_type: Create a single table with headers and rows from all ContentParts
4. For bullet_list content_type: Create a single list with items from all ContentParts
5. Format appropriately based on content_type ({contentType})
6. Ensure the generated content fits logically between previous and following sections
7. Return ONLY a JSON object with an "elements" array
8. Each element should match the content_type: {contentType}

## OUTPUT FORMAT
Return a JSON object with this structure:
```json
{{
  "elements": [
    {{
      "type": "{contentType}",
      "headers": [...], // if table
      "rows": [...], // if table
      "items": [...], // if bullet_list
      "content": "..." // if paragraph
    }}
  ]
}}
```

CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside the JSON.
"""
        else:
            prompt = f"""# TASK: Generate Section Content

## SECTION METADATA
- Section ID: {sectionId}
- Content Type: {contentType}
- Generation Hint: {generationHint}
{contextText}

## USER REQUEST (for context)
```
{userPrompt}
```

## AVAILABLE CONTENT FOR THIS SECTION
{availableContent}

## INSTRUCTIONS
1. Generate content for section "{sectionId}" based on the generation hint above
2. Use the available content parts to populate this section
3. For images: Use data URI format (data:image/[type];base64,[data]) when embedding base64 image data
4. For extracted text: Format appropriately based on content_type ({contentType})
5. Ensure the generated content fits logically between previous and following sections
6. Return ONLY a JSON object with an "elements" array
7. Each element should match the content_type: {contentType}

## OUTPUT FORMAT
Return a JSON object with this structure:
```json
{{
  "elements": [
    {{
      "type": "{contentType}",
      "content": "..."
    }}
  ]
}}
```

CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside the JSON.
"""
        return prompt

    @staticmethod
    def _describePartsForAggregation(parts: List[ContentPart]) -> str:
        """Describe parts for an aggregation prompt: count and IDs with metadata, no previews."""
        chunks = [
            "\n## CONTENT PARTS (Aggregation)\n",
            f"- Anzahl: {len(parts)} ContentParts\n",
            "- Alle ContentParts werden als Parameter übergeben (nicht im Prompt!)\n",
            "- Jeder Part kann sehr groß sein → Chunking automatisch\n",
            "- WICHTIG: Aggregiere ALLE Parts zu einem Element (z.B. eine Tabelle)\n\n",
            "ContentPart IDs:\n",
        ]
        for part in parts:
            contentFormat = part.metadata.get("contentFormat", "unknown")
            entry = f" - {part.id} (Format: {contentFormat}, Type: {part.typeGroup}"
            if part.metadata.get("originalFileName"):
                entry += f", Source: {part.metadata.get('originalFileName')}"
            chunks.append(entry + ")\n")
        return "".join(chunks)

    def _describePartsWithPreview(self, parts: List[ContentPart]) -> str:
        """Describe parts for a single-part prompt, with a preview of extracted text."""
        chunks = []
        for part in parts:
            contentFormat = part.metadata.get("contentFormat", "unknown")
            usageHint = part.metadata.get("usageHint")

            chunks.append(f"\n- ContentPart {part.id}:\n")
            chunks.append(f" Format: {contentFormat}\n")
            chunks.append(f" Type: {part.typeGroup}\n")
            if part.metadata.get("originalFileName"):
                chunks.append(f" Source file: {part.metadata.get('originalFileName')}\n")

            if contentFormat == "extracted":
                if part.data:
                    # Non-string data is stringified so slicing and "..." cannot fail.
                    text = part.data if isinstance(part.data, str) else str(part.data)
                    preview = text[:self._PREVIEW_LENGTH] + "..." if len(text) > self._PREVIEW_LENGTH else text
                    chunks.append(f" Content preview:\n```\n{preview}\n```\n")
                else:
                    chunks.append(" Content: (empty)\n")
            elif contentFormat == "reference":
                chunks.append(f" Reference: {part.metadata.get('documentReference')}\n")
                if usageHint:
                    chunks.append(f" Usage hint: {usageHint}\n")
            elif contentFormat == "object":
                dataLength = len(part.data) if part.data else 0
                chunks.append(f" Object type: {part.typeGroup}\n")
                chunks.append(f" MIME type: {part.mimeType}\n")
                chunks.append(f" Data size: {dataLength} chars (base64 encoded)\n")
                if usageHint:
                    chunks.append(f" Usage hint: {usageHint}\n")
        return "".join(chunks)

    def _describeSectionContext(
        self,
        allSections: Optional[List[Dict[str, Any]]],
        sectionIndex: Optional[int]
    ) -> str:
        """Describe up to two sections before and after sectionIndex; empty string if there are none."""
        if not allSections or sectionIndex is None:
            return ""

        previousSections = allSections[max(0, sectionIndex - 2):sectionIndex]
        followingSections = allSections[sectionIndex + 1:sectionIndex + 3]
        if not previousSections and not followingSections:
            return ""

        def describe(neighbour: Dict[str, Any]) -> str:
            # The hint may be missing or None; it is cut to 100 characters.
            hint = (self._sectionHint(neighbour) or "")[:100]
            return f"- {neighbour.get('id')} ({neighbour.get('content_type')}): {hint}\n"

        chunks = ["\n## DOCUMENT CONTEXT\n"]
        if previousSections:
            chunks.append("\nPrevious sections:\n")
            chunks.extend(describe(neighbour) for neighbour in previousSections)
        if followingSections:
            chunks.append("\nFollowing sections:\n")
            chunks.extend(describe(neighbour) for neighbour in followingSections)
        return "".join(chunks)

    def _findContentPartById(self, partId: str, contentParts: List[ContentPart]) -> Optional[ContentPart]:
        """Return the first content part whose id equals partId, or None if there is none."""
        return next((part for part in contentParts if part.id == partId), None)

    def _needsAggregation(
        self,
        contentType: str,
        contentPartCount: int
    ) -> bool:
        """
        Decide whether several content parts must be aggregated into one element.

        Aggregation is needed when the content type requires it (table,
        bullet_list) AND more than one content part is present. Paragraphs are
        never aggregated.

        Args:
            contentType: The section's content_type.
            contentPartCount: Number of content parts in the section.

        Returns:
            True if aggregation is needed, False otherwise.
        """
        return contentType in self._AGGREGATION_TYPES and contentPartCount > 1