# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

"""
Content Generator for hierarchical document generation.

Generates content for each section in the document structure produced by
Phase 1 (structure planning). Sections can be generated sequentially
(keeping previous sections as context) or in parallel batches.
"""

import logging
import asyncio
import json
import base64
import re
import traceback  # NOTE(review): unused in this module as shown — kept to avoid breaking unseen code
from typing import Dict, Any, Optional, List, Callable

from .subContentIntegrator import ContentIntegrator
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped

logger = logging.getLogger(__name__)


class ContentGenerator:
    """Generates content for document sections."""

    def __init__(self, services: Any):
        # services: project service container (AI calls, utils, user info).
        self.services = services
        self.integrator = ContentIntegrator(services)

    async def generateContent(
        self,
        structure: Dict[str, Any],
        cachedContent: Optional[Dict[str, Any]] = None,
        userPrompt: str = "",
        contentParts: Optional[List[Any]] = None,
        progressCallback: Optional[Callable] = None,
        parallelGeneration: bool = True,
        batchSize: int = 10
    ) -> Dict[str, Any]:
        """
        Generate content for all sections in structure.

        Args:
            structure: Document structure from Phase 1 (with contentPartIds per section)
            cachedContent: Extracted content cache
            userPrompt: Original user prompt
            contentParts: List of all available ContentParts (for mapping by contentPartIds)
            progressCallback: Function to call for progress updates (index, total, message)
            parallelGeneration: Enable parallel section generation
            batchSize: Number of sections to process in parallel

        Returns:
            Complete document structure with populated elements
        """
        try:
            documents = structure.get("documents", [])
            if not documents:
                logger.warning("No documents found in structure")
                return structure

            allGeneratedSections = []

            # Count total sections across all documents for progress tracking.
            totalSectionsAcrossDocs = 0
            for doc in documents:
                totalSectionsAcrossDocs += len(doc.get("sections", []))

            if progressCallback:
                progressCallback(0, totalSectionsAcrossDocs, "Starting content generation...")

            currentSectionIndex = 0

            for docIdx, doc in enumerate(documents):
                sections = doc.get("sections", [])
                totalSections = len(sections)
                if totalSections == 0:
                    continue

                # Parallel generation only pays off with more than one section;
                # sequential mode preserves strict section-to-section context.
                useParallel = parallelGeneration and totalSections > 1

                if progressCallback and docIdx > 0:
                    progressCallback(
                        currentSectionIndex, totalSectionsAcrossDocs,
                        f"Processing document {docIdx + 1}/{len(documents)}..."
                    )

                # The inner progress lambda offsets the per-document index into the
                # global index. currentSectionIndex is only incremented AFTER the
                # awaited call returns, so the closure sees the correct base offset.
                if useParallel:
                    generatedSections = await self._generateSectionsParallel(
                        sections=sections,
                        cachedContent=cachedContent,
                        userPrompt=userPrompt,
                        contentParts=contentParts,  # ContentParts for section generation
                        documentMetadata=structure.get("metadata", {}),
                        progressCallback=lambda idx, total, msg: progressCallback(
                            currentSectionIndex + idx, totalSectionsAcrossDocs, msg
                        ) if progressCallback else None,
                        batchSize=batchSize
                    )
                else:
                    generatedSections = await self._generateSectionsSequential(
                        sections=sections,
                        cachedContent=cachedContent,
                        userPrompt=userPrompt,
                        contentParts=contentParts,  # ContentParts for section generation
                        documentMetadata=structure.get("metadata", {}),
                        progressCallback=lambda idx, total, msg: progressCallback(
                            currentSectionIndex + idx, totalSectionsAcrossDocs, msg
                        ) if progressCallback else None
                    )

                allGeneratedSections.extend(generatedSections)
                currentSectionIndex += totalSections

            if progressCallback:
                progressCallback(
                    totalSectionsAcrossDocs, totalSectionsAcrossDocs,
                    "Content generation complete"
                )

            # Integrate generated content into structure.
            completeStructure = self.integrator.integrateContent(
                structure=structure,
                generatedSections=allGeneratedSections
            )
            return completeStructure

        except Exception as e:
            logger.error(f"Error generating content: {str(e)}")
            raise

    @staticmethod
    def _buildContentPartsMap(contentParts: Optional[List[Any]]) -> Dict[str, Any]:
        """Build a lookup map of ContentParts keyed by id (objects or dicts)."""
        contentPartsMap: Dict[str, Any] = {}
        if contentParts:
            for part in contentParts:
                partId = part.id if hasattr(part, 'id') else part.get('id', '')
                if partId:
                    contentPartsMap[partId] = part
        return contentPartsMap

    @staticmethod
    def _progressMessage(section: Dict[str, Any], contentType: str) -> str:
        """Build a human-readable progress message for one section."""
        if contentType == "image":
            return f"Generating image: {section.get('generation_hint', 'Image')[:50]}..."
        if contentType == "heading":
            return "Generating heading..."
        if contentType == "paragraph":
            return "Generating paragraph..."
        return f"Generating {contentType}..."

    @staticmethod
    def _resolveSectionContentParts(
        section: Dict[str, Any],
        contentPartsMap: Dict[str, Any]
    ) -> List[Any]:
        """Resolve the section's contentPartIds against the lookup map."""
        sectionContentParts: List[Any] = []
        sectionContentPartIds = section.get("contentPartIds", [])
        if sectionContentPartIds and contentPartsMap:
            for partId in sectionContentPartIds:
                if partId in contentPartsMap:
                    sectionContentParts.append(contentPartsMap[partId])
        return sectionContentParts

    async def _generateSectionsSequential(
        self,
        sections: List[Dict[str, Any]],
        cachedContent: Optional[Dict[str, Any]],
        userPrompt: str,
        contentParts: Optional[List[Any]] = None,
        documentMetadata: Optional[Dict[str, Any]] = None,
        progressCallback: Optional[Callable] = None
    ) -> List[Dict[str, Any]]:
        """
        Generate sections sequentially with enhanced progress tracking.
        Uses previous sections for context continuity.
        """
        # FIX: was a mutable default argument ({}); normalize None here instead.
        documentMetadata = documentMetadata if documentMetadata is not None else {}

        generatedSections: List[Dict[str, Any]] = []
        previousSections: List[Dict[str, Any]] = []
        totalSections = len(sections)

        contentPartsMap = self._buildContentPartsMap(contentParts)

        for idx, section in enumerate(sections):
            checkWorkflowStopped(self.services)
            try:
                contentType = section.get("content_type", "content")
                sectionId = section.get("id", f"section_{idx}")

                if progressCallback:
                    progressCallback(
                        idx + 1, totalSections,
                        self._progressMessage(section, contentType)
                    )

                context = {
                    "userPrompt": userPrompt,
                    "cachedContent": cachedContent,
                    "previousSections": previousSections.copy(),
                    "targetSection": section,
                    # ContentParts assigned to this specific section
                    "sectionContentParts": self._resolveSectionContentParts(section, contentPartsMap),
                    "documentMetadata": documentMetadata,
                    "operationId": None
                }

                generated = await self._generateSectionContent(section, context)
                generatedSections.append(generated)
                previousSections.append(generated)

                if contentType == "image":
                    logger.info(f"Successfully generated image for section {sectionId}")
                elif not generated.get("error"):
                    logger.debug(f"Successfully generated {contentType} for section {sectionId}")

            except Exception as e:
                # A failed section becomes an error placeholder; generation continues.
                logger.error(f"Error generating section {section.get('id')}: {str(e)}")
                errorSection = self.integrator.createErrorSection(section, str(e))
                generatedSections.append(errorSection)
                previousSections.append(errorSection)

        return generatedSections

    async def _generateSectionsParallel(
        self,
        sections: List[Dict[str, Any]],
        cachedContent: Optional[Dict[str, Any]],
        userPrompt: str,
        contentParts: Optional[List[Any]] = None,
        documentMetadata: Optional[Dict[str, Any]] = None,
        progressCallback: Optional[Callable] = None,
        batchSize: int = 10
    ) -> List[Dict[str, Any]]:
        """
        Generate sections in parallel batches with enhanced progress tracking.

        Args:
            sections: List of sections to generate
            cachedContent: Extracted content cache
            userPrompt: Original user prompt
            contentParts: List of all available ContentParts (for mapping by contentPartIds)
            documentMetadata: Document metadata
            progressCallback: Progress callback function
            batchSize: Number of sections to process in parallel per batch

        Returns:
            List of generated sections
        """
        # FIX: was a mutable default argument ({}); normalize None here instead.
        documentMetadata = documentMetadata if documentMetadata is not None else {}

        generatedSections: List[Dict[str, Any]] = []
        totalSections = len(sections)
        if totalSections == 0:
            return []

        contentPartsMap = self._buildContentPartsMap(contentParts)

        # Adjust batch size based on section types (image generation is slower).
        imageCount = sum(1 for s in sections if s.get("content_type") == "image")
        if imageCount > 0:
            adjustedBatchSize = min(batchSize, max(3, batchSize - imageCount // 2))
        else:
            adjustedBatchSize = batchSize

        totalBatches = (totalSections + adjustedBatchSize - 1) // adjustedBatchSize
        accumulatedPreviousSections: List[Dict[str, Any]] = []  # sections from earlier batches

        for batchNum, batchStart in enumerate(range(0, totalSections, adjustedBatchSize)):
            batch = sections[batchStart:batchStart + adjustedBatchSize]
            batchEnd = min(batchStart + adjustedBatchSize, totalSections)

            if progressCallback:
                progressCallback(
                    batchStart, totalSections,
                    f"Processing batch {batchNum + 1}/{totalBatches} ({len(batch)} sections)..."
                )

            async def generateWithProgress(
                section: Dict[str, Any],
                globalIndex: int,
                batchPreviousSections: List[Dict[str, Any]]
            ):
                """Generate one section; never raises — returns an error section instead."""
                checkWorkflowStopped(self.services)
                try:
                    contentType = section.get("content_type", "content")
                    sectionId = section.get("id", f"section_{globalIndex}")

                    if progressCallback:
                        progressCallback(
                            globalIndex + 1, totalSections,
                            self._progressMessage(section, contentType)
                        )

                    context = {
                        "userPrompt": userPrompt,
                        "cachedContent": cachedContent,
                        # Include sections from previous batches for continuity
                        "previousSections": batchPreviousSections.copy(),
                        "targetSection": section,
                        "sectionContentParts": self._resolveSectionContentParts(section, contentPartsMap),
                        "documentMetadata": documentMetadata,
                        "operationId": None  # Can be set if needed for nested progress
                    }

                    result = await self._generateSectionContent(section, context)

                    if contentType == "image":
                        logger.info(f"Successfully generated image for section {sectionId}")
                    elif not result.get("error"):
                        logger.debug(f"Successfully generated {contentType} for section {sectionId}")
                    return result
                except Exception as e:
                    logger.error(f"Error generating section {section.get('id')}: {str(e)}")
                    return self.integrator.createErrorSection(section, str(e))

            # Generate batch in parallel; all tasks share the sections accumulated
            # from earlier batches as context.
            batchTasks = [
                generateWithProgress(section, batchStart + idx, accumulatedPreviousSections)
                for idx, section in enumerate(batch)
            ]
            batchResults = await asyncio.gather(*batchTasks, return_exceptions=True)

            # Collect results; convert any raised exception into an error section.
            for idx, result in enumerate(batchResults):
                if isinstance(result, Exception):
                    logger.error(f"Error in parallel generation batch {batchNum + 1}: {str(result)}")
                    errorSection = self.integrator.createErrorSection(batch[idx], str(result))
                    generatedSections.append(errorSection)
                    accumulatedPreviousSections.append(errorSection)
                else:
                    generatedSections.append(result)
                    accumulatedPreviousSections.append(result)

            if progressCallback:
                progressCallback(
                    batchEnd, totalSections,
                    f"Completed batch {batchNum + 1}/{totalBatches}"
                )

        return generatedSections

    async def _generateSectionContent(
        self,
        section: Dict[str, Any],
        context: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Generate content for a single section.

        Args:
            section: Section to generate content for
            context: Generation context

        Returns:
            Section with populated elements array
        """
        try:
            contentType = section.get("content_type", "")
            complexity = section.get("complexity", "simple")

            # Dispatch by content type / complexity.
            if contentType == "image":
                return await self._generateImageSection(section, context)
            elif complexity == "complex":
                return await self._generateComplexTextSection(section, context)
            else:
                return await self._generateSimpleSection(section, context)
        except Exception as e:
            logger.error(f"Error generating section {section.get('id')}: {str(e)}")
            return self.integrator.createErrorSection(section, str(e))

    async def _generateSimpleSection(
        self,
        section: Dict[str, Any],
        context: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Generate content for simple section (heading, paragraph)."""
        try:
            sectionPrompt = self._createSectionPrompt(section, context)

            # Debug: log section generation prompt (harmonized — no checks needed)
            sectionId = section.get('id', 'unknown')
            contentType = section.get('content_type', 'unknown')
            self.services.utils.writeDebugFile(
                sectionPrompt,
                f"document_generation_section_{sectionId}_{contentType}_prompt"
            )

            # Call AI to generate content.
            from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
            options = AiCallOptions(
                operationType=OperationTypeEnum.DATA_GENERATE,
                resultFormat="json"
            )
            checkWorkflowStopped(self.services)
            aiResponse = await self.services.ai.callAiContent(
                prompt=sectionPrompt,
                options=options,
                outputFormat="json"
            )

            # Debug: log section generation response (harmonized — no checks needed).
            # Build a printable representation of whatever came back.
            if aiResponse:
                if hasattr(aiResponse, 'content') and aiResponse.content:
                    responseContent = aiResponse.content
                elif hasattr(aiResponse, 'documents') and aiResponse.documents:
                    responseContent = f"[Response has {len(aiResponse.documents)} documents]"
                else:
                    responseContent = f"[Response object: {type(aiResponse).__name__}, attributes: {dir(aiResponse)}]"
            else:
                responseContent = '[No response object]'

            self.services.utils.writeDebugFile(
                responseContent,
                f"document_generation_section_{sectionId}_{contentType}_response"
            )
            logger.debug(f"Logged section response for {sectionId} ({len(responseContent)} chars)")

            if not aiResponse or not aiResponse.content:
                logger.error(f"AI section generation returned empty response for section {sectionId}")
                logger.error(f"Response object: {aiResponse}, has content: {hasattr(aiResponse, 'content') if aiResponse else False}")
                raise ValueError("AI section generation returned empty response")

            rawContent = aiResponse.content if aiResponse and aiResponse.content else ""
            if not rawContent or not rawContent.strip():
                logger.error(f"AI section generation returned empty response for section {sectionId}")
                logger.error(f"Response object: {aiResponse}, content length: {len(rawContent) if rawContent else 0}")
                raise ValueError("AI section generation returned empty response")

            extractedJson = self.services.utils.jsonExtractString(rawContent)
            if not extractedJson or not extractedJson.strip():
                logger.error(f"No JSON found in AI response for section {sectionId}")
                logger.error(f"Raw response (first 1000 chars): {rawContent[:1000]}")
                logger.error(f"Extracted JSON (first 500 chars): {extractedJson[:500] if extractedJson else 'None'}")
                raise ValueError("No JSON found in AI section response")

            elementsData = self._parseElementsJson(section, extractedJson, rawContent)
            elements = self._extractElements(section, elementsData, extractedJson)

            # Update section with elements.
            section["elements"] = elements
            return section

        except Exception as e:
            logger.error(f"Error generating simple section: {str(e)}")
            raise

    def _parseElementsJson(
        self,
        section: Dict[str, Any],
        extractedJson: str,
        rawContent: str
    ) -> Any:
        """
        Parse the extracted JSON, attempting recovery for truncated responses.

        Raises ValueError when the JSON is invalid and cannot be recovered.
        """
        try:
            elementsData = json.loads(extractedJson)
            logger.debug(f"Parsed JSON for section {section.get('id')}: type={type(elementsData)}, keys={list(elementsData.keys()) if isinstance(elementsData, dict) else 'N/A'}")
            return elementsData
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse JSON from AI response for section {section.get('id')}")
            logger.error(f"JSON decode error: {str(e)}")
            logger.error(f"Extracted JSON length: {len(extractedJson)} chars")
            logger.error(f"Extracted JSON (first 1000 chars): {extractedJson[:1000]}")
            if len(extractedJson) > 1000:
                logger.error(f"Extracted JSON (last 500 chars): {extractedJson[-500:]}")
            logger.error(f"Raw AI response length: {len(rawContent)} chars")
            logger.error(f"Raw AI response (first 1000 chars): {rawContent[:1000] if rawContent else 'None'}")

            # Try to recover from truncated JSON if it looks like it was cut off.
            if "Expecting" in str(e) and ("delimiter" in str(e) or "value" in str(e)):
                # Check if JSON starts correctly but is truncated.
                if extractedJson.strip().startswith('{"elements"'):
                    logger.warning("JSON appears truncated, attempting recovery...")
                    # closeJsonStructures handles unterminated strings properly.
                    try:
                        from modules.shared.jsonUtils import closeJsonStructures
                        recoveredJson = closeJsonStructures(extractedJson)
                        logger.info("Attempting to parse recovered JSON (closed structures)")
                        logger.debug(f"Recovered JSON length: {len(recoveredJson)} chars (original: {len(extractedJson)} chars)")
                        elementsData = json.loads(recoveredJson)
                        logger.info(f"Successfully recovered JSON for section {section.get('id')}")
                        return elementsData
                    except (json.JSONDecodeError, ValueError) as recoveryError:
                        logger.error(f"JSON recovery failed: {str(recoveryError)}")
                        logger.error(f"Recovered JSON (first 500 chars): {recoveredJson[:500] if 'recoveredJson' in locals() else 'N/A'}")
                        logger.error(f"Recovered JSON (last 200 chars): {recoveredJson[-200:] if 'recoveredJson' in locals() else 'N/A'}")
                        # Last resort: extract partial content and build minimal valid JSON.
                        try:
                            # Look for a text field that might be partially complete.
                            textMatch = re.search(r'"text"\s*:\s*"([^"]*)', extractedJson)
                            if textMatch:
                                partialText = textMatch.group(1)
                                # Minimal valid JSON with the truncation marked in the text.
                                elementsData = {
                                    "elements": [{
                                        "text": partialText + "... [Content truncated due to token limit]"
                                    }]
                                }
                                logger.warning(f"Created minimal JSON structure with truncated text for section {section.get('id')}")
                            else:
                                elementsData = {"elements": []}
                                logger.warning(f"Created empty JSON structure for section {section.get('id')} due to recovery failure")
                            return elementsData
                        except Exception as fallbackError:
                            logger.error(f"Fallback recovery also failed: {str(fallbackError)}")
                            # Check whether the raw response itself might be truncated.
                            if len(rawContent) <= len(extractedJson) + 100:
                                logger.warning(f"Raw AI response may be truncated (length: {len(rawContent)} chars)")
                                logger.warning("Consider increasing max_tokens for AI calls or checking token limits")
                            raise ValueError(f"Invalid JSON in AI response (truncated?): {str(e)}")
                else:
                    raise ValueError(f"Invalid JSON in AI response: {str(e)}")
            else:
                raise ValueError(f"Invalid JSON in AI response: {str(e)}")

    def _extractElements(
        self,
        section: Dict[str, Any],
        elementsData: Any,
        extractedJson: str
    ) -> List[Any]:
        """
        Extract the elements array from a parsed AI response, tolerating the
        various shapes different models return.
        """
        contentType = section.get("content_type", "")
        elements = None
        if isinstance(elementsData, dict):
            # Try to find elements in the various possible locations.
            if "elements" in elementsData:
                elements = elementsData["elements"]
            elif "content" in elementsData and isinstance(elementsData["content"], list):
                # Some models return {"content": [...]}
                elements = elementsData["content"]
            elif "data" in elementsData and isinstance(elementsData["data"], list):
                # Some models return {"data": [...]}
                elements = elementsData["data"]
            elif len(elementsData) == 1:
                # Single-key dict — might hold the elements directly.
                # NOTE(review): if the single value is NOT a list, elements stays
                # None and the error below fires — confirm this is intended.
                firstValue = list(elementsData.values())[0]
                if isinstance(firstValue, list):
                    elements = firstValue
            else:
                # Try to convert the entire dict into a single element.
                logger.warning(f"AI returned dict without 'elements' key, attempting to convert: {list(elementsData.keys())}")
                if contentType == "heading":
                    text = elementsData.get("text") or elementsData.get("heading") or str(elementsData)
                    level = elementsData.get("level", 1)
                    elements = [{"level": level, "text": text}]
                elif contentType == "paragraph":
                    text = elementsData.get("text") or elementsData.get("content") or str(elementsData)
                    elements = [{"text": text}]
                else:
                    elements = [elementsData]
        elif isinstance(elementsData, list):
            elements = elementsData
        else:
            # Primitive value — wrap it.
            logger.warning(f"AI returned primitive value, wrapping: {type(elementsData)}")
            if contentType == "heading":
                elements = [{"level": 1, "text": str(elementsData)}]
            elif contentType == "paragraph":
                elements = [{"text": str(elementsData)}]
            else:
                elements = [{"text": str(elementsData)}]

        if elements is None:
            logger.error(f"Could not extract elements from AI response. Response structure: {type(elementsData)}, keys: {list(elementsData.keys()) if isinstance(elementsData, dict) else 'N/A'}")
            logger.error(f"Full response (first 500 chars): {str(extractedJson)[:500]}")
            raise ValueError(f"Invalid elements format in AI response. Expected dict with 'elements' key or list, got: {type(elementsData)}")

        if not isinstance(elements, list):
            logger.warning(f"Elements is not a list, converting: {type(elements)}")
            elements = [elements]

        return elements

    async def _generateImageSection(
        self,
        section: Dict[str, Any],
        context: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Generate image for image section or include existing image."""
        try:
            # First, check if section has image ContentParts to integrate directly.
            sectionContentParts = context.get("sectionContentParts", [])
            if sectionContentParts:
                for part in sectionContentParts:
                    partTypeGroup = part.typeGroup if hasattr(part, 'typeGroup') else part.get('typeGroup', '')
                    partMimeType = part.mimeType if hasattr(part, 'mimeType') else part.get('mimeType', '')
                    isImage = partTypeGroup == "image" or (partMimeType and partMimeType.startswith("image/"))
                    if not isImage:
                        continue

                    partData = part.data if hasattr(part, 'data') else part.get('data', '')
                    partId = part.id if hasattr(part, 'id') else part.get('id', '')

                    # Get base64 data from the ContentPart payload.
                    base64Data = None
                    if isinstance(partData, str):
                        if partData.startswith("data:image"):
                            # Extract base64 from data URL.
                            base64Data = partData.split(",", 1)[1] if "," in partData else partData
                        elif len(partData) > 100:
                            # Likely already a base64 string.
                            base64Data = partData
                    elif isinstance(partData, bytes):
                        # FIX: removed redundant local `import base64` (module-level import).
                        base64Data = base64.b64encode(partData).decode('utf-8')

                    if base64Data:
                        # Caption priority: section.caption > metadata.caption.
                        caption = section.get("caption") or section.get("metadata", {}).get("caption")
                        # Alt text from ContentPart metadata or section hint.
                        altText = part.metadata.get("altText") if hasattr(part, 'metadata') else part.get('metadata', {}).get('altText')
                        if not altText:
                            altText = section.get("generation_hint", "Image")

                        # Nested content structure for consistency with renderers;
                        # caption is duplicated at element level for compatibility.
                        section["elements"] = [{
                            "type": "image",
                            "content": {
                                "base64Data": base64Data,
                                "altText": altText,
                                "caption": caption
                            },
                            "caption": caption
                        }]
                        logger.info(f"Successfully integrated image from ContentPart {partId} for section {section.get('id')} with caption: {caption}")
                        return section

            # Check if this is an existing image to include or render.
            imageSource = section.get("image_source", "generate")
            if imageSource == "existing" or imageSource == "render":
                # Phase 4: include existing image or render image from cachedContent.
                imageRefId = section.get("image_reference_id")
                if not imageRefId:
                    raise ValueError(f"Image section {section.get('id')} has image_source='{imageSource}' but no image_reference_id")

                cachedContent = context.get("cachedContent", {})
                imageDocuments = cachedContent.get("imageDocuments", [])
                imageDoc = next((img for img in imageDocuments if img.get("id") == imageRefId), None)
                if not imageDoc:
                    raise ValueError(f"Image document {imageRefId} not found in cachedContent.imageDocuments")

                altText = imageDoc.get("altText", section.get("generation_hint", "Image"))
                caption = section.get("caption") or section.get("metadata", {}).get("caption")

                section["elements"] = [{
                    "type": "image",
                    "content": {
                        "base64Data": imageDoc.get("base64Data"),
                        "altText": altText,
                        "caption": caption
                    },
                    "caption": caption
                }]
                logger.info(f"Successfully integrated image {imageRefId} for section {section.get('id')} (source={imageSource})")
                return section

            # Generate a new image.
            imagePrompt = section.get("image_prompt")
            if not imagePrompt:
                # Fall back to the generation hint.
                generationHint = section.get("generation_hint", "")
                if generationHint:
                    imagePrompt = f"Create a professional illustration: {generationHint}"
                else:
                    raise ValueError(f"Image section {section.get('id')} missing image_prompt and generation_hint")

            from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, AiCallPromptImage
            promptModel = AiCallPromptImage(
                prompt=imagePrompt,
                size="1024x1024",
                quality="standard",
                style="vivid"
            )
            promptJson = promptModel.model_dump_json(exclude_none=True, indent=2)
            options = AiCallOptions(
                operationType=OperationTypeEnum.IMAGE_GENERATE,
                resultFormat="base64"
            )

            logger.info(f"Starting image generation for section {section.get('id')}: {imagePrompt[:100]}...")
            checkWorkflowStopped(self.services)
            aiResponse = await self.services.ai.callAiContent(
                prompt=promptJson,
                options=options,
                outputFormat="base64"
            )

            # Extract base64 image data (documents first, content as fallback).
            base64Data = None
            if aiResponse and aiResponse.documents and len(aiResponse.documents) > 0:
                imageDoc = aiResponse.documents[0]
                base64Data = imageDoc.documentData
                logger.debug(f"Image data extracted from documents: {len(base64Data) if base64Data else 0} chars")
            if not base64Data and aiResponse and aiResponse.content:
                base64Data = aiResponse.content
                logger.debug(f"Image data extracted from content: {len(base64Data) if base64Data else 0} chars")
            if not base64Data:
                raise ValueError("Image generation returned no data")

            # Best-effort sanity check on the first 100 chars; the renderer
            # ultimately handles invalid data.
            try:
                base64.b64decode(base64Data[:100], validate=True)
            except Exception as e:
                logger.warning(f"Image data may not be valid base64: {str(e)}")

            altText = section.get("generation_hint", "Image")
            if not altText or altText == "Image":
                # Use image_prompt as alt text if the hint is generic; limit length.
                altText = section.get("image_prompt", "Image")[:100]
            caption = section.get("caption") or section.get("metadata", {}).get("caption")

            section["elements"] = [{
                "type": "image",
                "content": {
                    "base64Data": base64Data,
                    "altText": altText,
                    "caption": caption
                },
                "caption": caption
            }]
            logger.info(f"Successfully generated image for section {section.get('id')}")
            return section

        except Exception as e:
            logger.error(f"Error generating image section: {str(e)}")
            raise

    async def _generateComplexTextSection(
        self,
        section: Dict[str, Any],
        context: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Generate content for complex text section (long chapter)."""
        # For now, use the same approach as a simple section.
        # Can be enhanced later with chunking for very long content.
        return await self._generateSimpleSection(section, context)

    def _createSectionPrompt(
        self,
        section: Dict[str, Any],
        context: Dict[str, Any]
    ) -> str:
        """Create sub-prompt for section content generation."""
        contentType = section.get("content_type", "")
        generationHint = section.get("generation_hint", "")
        extractionPrompt = section.get("extractionPrompt")  # optional, for ContentParts
        userPrompt = context.get("userPrompt", "")
        cachedContent = context.get("cachedContent")
        previousSections = context.get("previousSections", [])
        sectionContentParts = context.get("sectionContentParts", [])
        documentMetadata = context.get("documentMetadata", {})

        userLanguage = self._getUserLanguage()

        # Format cached content.
        cachedContentText = ""
        if cachedContent and cachedContent.get("extractedContent"):
            cachedContentText = self._formatCachedContent(cachedContent)

        # Format ContentParts for this section.
        contentPartsText = ""
        imagePartReferences = []  # image parts referenced as text in non-image sections
        if sectionContentParts:
            try:
                partsList = []
                imageIndex = 1
                for part in sectionContentParts:
                    partTypeGroup = part.typeGroup if hasattr(part, 'typeGroup') else part.get('typeGroup', '')
                    partMimeType = part.mimeType if hasattr(part, 'mimeType') else part.get('mimeType', '')
                    partId = part.id if hasattr(part, 'id') else part.get('id', '')
                    partData = part.data if hasattr(part, 'data') else part.get('data', '')

                    isImage = partTypeGroup == "image" or (partMimeType and partMimeType.startswith("image/"))

                    if contentType == "image" and isImage:
                        # Image sections: include image data marker for integration.
                        partsList.append(f"- ContentPart {partId} (image): [Image data available for integration]")
                    elif isImage:
                        # Non-image sections: track for text reference only — no
                        # image data in the prompt.
                        imagePartReferences.append({
                            "id": partId,
                            "index": imageIndex
                        })
                        imageIndex += 1
                    else:
                        # Text/table/etc. parts: include a short data preview.
                        dataPreview = str(partData)[:200] if partData else "[No data]"
                        partsList.append(f"- ContentPart {partId} ({partTypeGroup}): {dataPreview}{'...' if partData and len(str(partData)) > 200 else ''}")

                if partsList:
                    contentPartsText = "\n".join(partsList)

                # Add image reference instructions for non-image sections.
                if imagePartReferences and contentType != "image":
                    refText = ", ".join([f"Bild {ref['index']}" if userLanguage == "de" else f"Image {ref['index']}" for ref in imagePartReferences])
                    contentPartsText += f"\n\nNOTE: Reference images as text in the document language: {refText}"
            except Exception as e:
                logger.warning(f"Could not format ContentParts for section prompt: {str(e)}")
                contentPartsText = ""

        # Format previous sections for context (last 10).
        previousSectionsText = ""
        if previousSections:
            formattedSections = []
            for s in previousSections[-10:]:
                # Distinct name to avoid shadowing this section's contentType.
                prevContentType = s.get('content_type', 'unknown')
                order = s.get('order', 0)
                elements = s.get('elements', [])

                # Extract a short preview of the actual content from elements.
                contentPreview = ""
                if elements:
                    if prevContentType == "heading":
                        for elem in elements:
                            if isinstance(elem, dict) and "text" in elem:
                                contentPreview = f": \"{elem['text']}\""
                                break
                    elif prevContentType == "paragraph":
                        for elem in elements:
                            if isinstance(elem, dict) and "text" in elem:
                                text = elem['text']
                                contentPreview = f": \"{text[:100]}{'...' if len(text) > 100 else ''}\""
                                break
                    elif prevContentType == "bullet_list":
                        for elem in elements:
                            if isinstance(elem, dict) and "items" in elem:
                                items = elem['items']
                                if items:
                                    contentPreview = f": {items[:3]}{'...' if len(items) > 3 else ''}"
                                break

                formattedSections.append(
                    f"- Section {order} ({prevContentType}){contentPreview}"
                )
            previousSectionsText = "\n".join(formattedSections)

        # FIX: the final CRITICAL instruction was mis-numbered "6." (after 6 and 7);
        # renumbered to "8.".
        prompt = f"""{'='*80}
SECTION TO GENERATE:
{'='*80}
Type: {contentType}
Hint: {generationHint}

{'='*80}
CONTEXT:
- User Request: {userPrompt}
- Previous Sections: {len(previousSections)} sections already generated
- Document Title: {documentMetadata.get('title', 'Unknown')}

{'='*80}
PREVIOUS SECTIONS (for continuity):
{'='*80}
{previousSectionsText if previousSectionsText else "This is the first section."}
{'='*80}

{'='*80}
EXTRACTED CONTENT (if available):
{'='*80}
{cachedContentText if cachedContentText else "None"}
{'='*80}

{'='*80}
CONTENT PARTS FOR THIS SECTION:
{'='*80}
{contentPartsText if contentPartsText else "No ContentParts assigned to this section."}
{'='*80}

TASK: Generate content for this section ONLY.

INSTRUCTIONS:
1. Generate content appropriate for section type: {contentType}
2. Use the generation hint: {generationHint}
{f"3. Use extractionPrompt for ContentParts: {extractionPrompt}" if extractionPrompt else "3. Use ContentParts data if provided"}
4. Consider previous sections for continuity
5. Use extracted content if relevant
6. All content must be in the language '{userLanguage}'
7. {'For image sections: Integrate image ContentParts as visual elements' if contentType == "image" else 'For non-image sections: Reference image ContentParts as text (e.g., "siehe Bild 1" in German, "see Image 1" in English)'}
8. CRITICAL: Return ONLY a JSON object with an "elements" array. DO NOT return a full document structure.

REQUIRED FORMAT - Return ONLY this structure:

For heading:
{{"elements": [{{"level": 1, "text": "Heading Text"}}]}}

For paragraph:
{{"elements": [{{"text": "Paragraph text content"}}]}}

For table:
{{"elements": [{{"headers": ["Col1", "Col2"], "rows": [["Row1", "Row2"]]}}]}}

For bullet_list:
{{"elements": [{{"items": ["Item 1", "Item 2"]}}]}}

For code_block:
{{"elements": [{{"code": "code content here", "language": "python"}}]}}

CRITICAL RULES:
- Return ONLY {{"elements": [...]}} - nothing else
- DO NOT include "metadata", "documents", "sections", or any other fields
- DO NOT return a full document structure
- DO NOT add explanatory text before or after the JSON
- The response must start with {{"elements": and end with }}
- This is a SINGLE SECTION, not a full document
"""
        return prompt

    def _formatCachedContent(self, cachedContent: Dict[str, Any]) -> str:
        """Format cached content for prompt inclusion."""
        try:
            extractedContent = cachedContent.get("extractedContent", [])
            if not extractedContent:
                return "No content extracted."

            formattedParts = []
            for extracted in extractedContent:
                if hasattr(extracted, 'parts'):
                    for part in extracted.parts:
                        if hasattr(part, 'content'):
                            formattedParts.append(part.content)
                elif isinstance(extracted, dict):
                    formattedParts.append(str(extracted))
                else:
                    formattedParts.append(str(extracted))

            return "\n\n".join(formattedParts) if formattedParts else "No content extracted."
        except Exception as e:
            logger.warning(f"Error formatting cached content: {str(e)}")
            return "Error formatting cached content."

    def _getUserLanguage(self) -> str:
        """Get user language for document generation; falls back to 'en'."""
        try:
            if self.services:
                if hasattr(self.services, 'currentUserLanguage') and self.services.currentUserLanguage:
                    return self.services.currentUserLanguage
                elif hasattr(self.services, 'user') and self.services.user and hasattr(self.services.user, 'language'):
                    return self.services.user.language
        except Exception:
            pass
        return 'en'  # default fallback