serviceCenter = DI container (Resolver, Registry, Context) for service instantiation
serviceHub = consumer-facing aggregation (DB interfaces, runtime state, lazy service resolution via serviceCenter)
- modules/serviceHub/ created: ServiceHub, PublicService, getInterface()
- 22 consumer files migrated (routes, features, tests): imports switched from modules.services to serviceHub or serviceCenter
- resolver.py: legacy fallback to the old services/ removed
- modules/services/ deleted entirely (83 files, incl. dead code mainAiChat.py)
- pre-extraction: progress callback propagated through the chunk pipeline; operationType DATA_EXTRACT -> DATA_ANALYSE for a cheaper model

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

"""
Structure Filling Module

Handles filling document structure with content, including:
- Filling sections with content parts
- Building section generation prompts
- Aggregation logic
"""

import json
import logging
import copy
import asyncio
from typing import Dict, Any, List, Optional, Tuple

from modules.datamodels.datamodelExtraction import ContentPart
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum, ProcessingModeEnum
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped

logger = logging.getLogger(__name__)
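
# Minimal usage sketch (illustrative only; `services` and `aiService` are
# assumed to be provided by the surrounding serviceCenter/serviceHub wiring):
#
#   filler = StructureFiller(services, aiService)
#   filled = await filler.fillStructure(
#       structure=chapterStructure,   # dict with "documents" -> "chapters"
#       contentParts=preparedParts,   # List[ContentPart]
#       userPrompt="Create a project report",
#       parentOperationId="op_123",
#       language="en",                # optional; falls back to user language
#   )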


class StructureFiller:
    """Handles filling document structure with content."""

    # Default concurrency limit for parallel generation (chapters/sections)
    DEFAULT_MAX_CONCURRENT_GENERATION = 5

    def __init__(self, services, aiService):
        """Initialize StructureFiller with service center and AI service access."""
        self.services = services
        self.aiService = aiService

    def _getMaxConcurrentGeneration(self, options: Optional[AiCallOptions] = None) -> int:
        """Get max concurrent generation limit, configurable via options."""
        if options and hasattr(options, 'maxConcurrentGeneration'):
            return options.maxConcurrentGeneration
        return self.DEFAULT_MAX_CONCURRENT_GENERATION
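
    # Override sketch (assumes AiCallOptions tolerates an extra
    # `maxConcurrentGeneration` attribute; hypothetical usage):
    #
    #   options = AiCallOptions(operationType=OperationTypeEnum.DATA_ANALYSE)
    #   options.maxConcurrentGeneration = 10
    #   limit = filler._getMaxConcurrentGeneration(options)  # -> 10 instead of 5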

    def _getUserLanguage(self) -> str:
        """Get user language for document generation"""
        try:
            if self.services:
                # Prefer detected language if available (from user intention analysis)
                if hasattr(self.services, 'currentUserLanguage') and self.services.currentUserLanguage:
                    return self.services.currentUserLanguage
                # Fallback to user's preferred language
                elif hasattr(self.services, 'user') and self.services.user and hasattr(self.services.user, 'language'):
                    return self.services.user.language
        except Exception:
            pass
        return 'en'  # Default fallback

    def _getDocumentLanguage(self, structure: Dict[str, Any], documentId: str) -> str:
        """
        Get language for a specific document from structure.
        Falls back to user language if not specified.

        Args:
            structure: The document structure with documents array
            documentId: The ID of the document to get language for

        Returns:
            ISO 639-1 language code (e.g., "de", "en", "fr")
        """
        # Try to find document in structure
        for doc in structure.get("documents", []):
            if doc.get("id") == documentId:
                docLanguage = doc.get("language")
                if docLanguage:
                    return docLanguage

        # Fallback to metadata language
        metadataLanguage = structure.get("metadata", {}).get("language")
        if metadataLanguage:
            return metadataLanguage

        # Fallback to user language
        return self._getUserLanguage()
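
    # Resolution order for a structure like the following (illustrative sketch):
    #
    #   structure = {
    #       "metadata": {"language": "en"},
    #       "documents": [{"id": "doc1", "language": "de", "chapters": []}],
    #   }
    #   filler._getDocumentLanguage(structure, "doc1")   # -> "de" (document wins)
    #   filler._getDocumentLanguage(structure, "other")  # -> "en" (metadata fallback)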

    def _extractContentPartInfo(self, chapter: Dict[str, Any]) -> Tuple[List[str], Dict[str, Any]]:
        """
        Extract contentPartIds and contentPartInstructions from chapter's contentParts structure.

        Returns:
            tuple: (contentPartIds list, contentPartInstructions dict)
        """
        contentParts = chapter.get("contentParts", {})
        contentPartIds = list(contentParts.keys())
        # Extract instructions (entries with "instruction" field) and captions (entries with "caption" field)
        contentPartInstructions = {}
        for partId, partInfo in contentParts.items():
            if isinstance(partInfo, dict):
                if "instruction" in partInfo:
                    contentPartInstructions[partId] = {"instruction": partInfo["instruction"]}
                elif "caption" in partInfo:
                    # For entries with only caption (no instruction), still add to dict so it's available
                    contentPartInstructions[partId] = {"caption": partInfo["caption"]}
        return contentPartIds, contentPartInstructions
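
    # Expected chapter shape (sketch): contentParts maps part IDs to optional
    # per-part metadata, e.g.
    #
    #   chapter = {"contentParts": {
    #       "part-a": {"instruction": "Summarize key findings"},
    #       "part-b": {"caption": "Figure 1: Overview"},
    #       "part-c": {},
    #   }}
    #   ids, instructions = filler._extractContentPartInfo(chapter)
    #   # ids == ["part-a", "part-b", "part-c"]
    #   # instructions == {"part-a": {"instruction": ...}, "part-b": {"caption": ...}}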

    def _getContentPartCaption(self, chapter: Dict[str, Any], partId: str) -> Optional[str]:
        """
        Get caption for a contentPart from chapter's contentParts structure.
        Returns None if no caption is available.

        Args:
            chapter: Chapter dict
            partId: ContentPart ID

        Returns:
            Caption string or None
        """
        if "contentParts" in chapter:
            contentParts = chapter.get("contentParts", {})
            partInfo = contentParts.get(partId)
            if isinstance(partInfo, dict) and "caption" in partInfo:
                return partInfo["caption"]
        return None

    async def fillStructure(
        self,
        structure: Dict[str, Any],
        contentParts: List[ContentPart],
        userPrompt: str,
        parentOperationId: str,
        language: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Phase 5D: Chapter content generation (two-phase approach).

        Phase 5D.1: Generates the sections structure for each chapter
        Phase 5D.2: Fills the sections with ContentParts

        Args:
            structure: Structure dict with documents and chapters (not sections!)
            contentParts: All prepared ContentParts
            userPrompt: The user request
            parentOperationId: Parent operation ID for the ChatLog hierarchy
            language: Language identified from user intention analysis (e.g., "de", "en", "fr")

        Returns:
            Filled structure with elements in each section (after flattening)
        """
        # Create an operation ID for structure filling
        fillOperationId = f"{parentOperationId}_structure_filling"

        # Validate structure has chapters
        hasChapters = False
        for doc in structure.get("documents", []):
            if "chapters" in doc:
                hasChapters = True
                break

        if not hasChapters:
            error_msg = "Structure must have chapters. Legacy section-based structure is not supported."
            logger.error(error_msg)
            raise ValueError(error_msg)

        # Get language from services (user intention analysis) or parameter
        if language is None:
            language = self._getUserLanguage()
            logger.debug(f"Using language from services (user intention analysis): {language}")
        else:
            logger.debug(f"Using provided language parameter: {language}")

        # Start the ChatLog with a parent reference
        chapterCount = sum(len(doc.get("chapters", [])) for doc in structure.get("documents", []))
        self.services.chat.progressLogStart(
            fillOperationId,
            "Chapter Content Generation",
            "Filling",
            f"Processing {chapterCount} chapters",
            parentOperationId=parentOperationId
        )

        try:
            filledStructure = copy.deepcopy(structure)

            # Get options from AI service if available (for concurrency control)
            # The default concurrency limit (5) will be used if options is None
            options = None
            # Note: Options can be passed via fillStructure if needed in the future

            # Phase 5D.1: Generate the sections structure for each chapter
            filledStructure = await self._generateChapterSectionsStructure(
                filledStructure, contentParts, userPrompt, fillOperationId, language, options
            )

            # Phase 5D.2: Fill the sections with ContentParts
            filledStructure = await self._fillChapterSections(
                filledStructure, contentParts, userPrompt, fillOperationId, language, options
            )

            # Flattening: convert chapters to sections
            flattenedStructure = self._flattenChaptersToSections(filledStructure)

            # Add ContentParts metadata to the structure (for validation)
            flattenedStructure = self._addContentPartsMetadata(flattenedStructure, contentParts)

            # State 4 Validation: Validate and auto-fix filled structure
            # Validation 4.1: Filled structure missing 'documents' field
            if "documents" not in flattenedStructure:
                raise ValueError("Filled structure missing 'documents' field - cannot auto-fix")

            for doc in flattenedStructure["documents"]:
                # Validation 4.4: Verify language is preserved from input structure
                # Language MUST be preserved from Phase 3 structure (validated in State 3)
                if "language" not in doc:
                    raise ValueError(f"Document {doc.get('id')} missing language in filled structure - should have been preserved from Phase 3")

                # Validate language format
                if not isinstance(doc["language"], str) or len(doc["language"]) != 2:
                    raise ValueError(f"Document {doc.get('id')} has invalid language format in filled structure: {doc['language']} - should be 2-character ISO 639-1 code")

                # CRITICAL: flattenedStructure has sections, not chapters!
                # After flattening, chapters are converted to sections, so we need to validate sections directly
                for section in doc.get("sections", []):
                    # Validation 4.2: Section missing 'elements' field
                    if "elements" not in section:
                        section["elements"] = []
                        logger.info(f"Section {section.get('id')} missing 'elements' - created empty list")

                    # Validation 4.3: Section has empty elements list - ALLOW (intentionally empty is OK)
                    # No action needed - empty elements are allowed

            # Finish the ChatLog
            self.services.chat.progressLogFinish(fillOperationId, True)

            return flattenedStructure

        except Exception as e:
            self.services.chat.progressLogFinish(fillOperationId, False)
            logger.error(f"Error in fillStructure: {str(e)}")
            raise

    async def _generateSingleChapterSectionsStructure(
        self,
        chapter: Dict[str, Any],
        chapterIndex: int,
        chapterId: str,
        chapterLevel: int,
        chapterTitle: str,
        generationHint: str,
        contentPartIds: List[str],
        contentPartInstructions: Dict[str, Any],
        contentParts: List[ContentPart],
        userPrompt: str,
        language: str,
        outputFormat: str,
        parentOperationId: str,
        totalChapters: int
    ) -> None:
        """
        Generate sections structure for a single chapter (used for parallel processing).
        Modifies chapter dict in place.
        """
        try:
            # Update progress for chapter structure generation
            progress = chapterIndex / totalChapters if totalChapters > 0 else 1.0
            self.services.chat.progressLogUpdate(
                parentOperationId,
                progress,
                f"Generating sections for Chapter {chapterIndex}/{totalChapters}: {chapterTitle}"
            )

            chapterPrompt = self._buildChapterSectionsStructurePrompt(
                chapterId=chapterId,
                chapterLevel=chapterLevel,
                chapterTitle=chapterTitle,
                generationHint=generationHint,
                contentPartIds=contentPartIds,
                contentPartInstructions=contentPartInstructions,
                contentParts=contentParts,
                userPrompt=userPrompt,
                language=language,
                outputFormat=outputFormat
            )

            # AI call for chapter structure generation
            # Note: Debug logging is handled by callAiPlanning
            checkWorkflowStopped(self.services)
            aiResponse = await self.aiService.callAiPlanning(
                prompt=chapterPrompt,
                debugType=f"chapter_structure_{chapterId}"
            )

            sectionsStructure = json.loads(
                self.services.utils.jsonExtractString(aiResponse)
            )

            chapter["sections"] = sectionsStructure.get("sections", [])

            # Set the useAiCall flag (if not set by the AI)
            # IMPORTANT: useAiCall can only be true if at least one ContentPart has the format "extracted"!
            # "object" and "reference" formats are added directly as elements and need no AI.
            for section in chapter["sections"]:
                if "useAiCall" not in section:
                    contentType = section.get("content_type", "paragraph")
                    sectionContentPartIds = section.get("contentPartIds", [])

                    # Check whether at least one ContentPart has the format "extracted"
                    hasExtractedPart = False
                    for partId in sectionContentPartIds:
                        part = self._findContentPartById(partId, contentParts)
                        if part:
                            contentFormat = part.metadata.get("contentFormat", "unknown")
                            if contentFormat == "extracted":
                                hasExtractedPart = True
                                break

                    # useAiCall can only be true if extracted parts are present
                    useAiCall = False
                    if hasExtractedPart:
                        # Check whether a transformation is needed
                        useAiCall = contentType != "paragraph"

                        # Check contentPartInstructions for a transformation
                        if not useAiCall:
                            for partId in sectionContentPartIds:
                                instruction = contentPartInstructions.get(partId, {}).get("instruction", "")
                                if instruction and instruction.lower() not in ["include full text", "include all content", "use full extracted text"]:
                                    useAiCall = True
                                    break

                    section["useAiCall"] = useAiCall
                    logger.debug(f"Section {section.get('id')}: useAiCall={useAiCall} (hasExtractedPart={hasExtractedPart}, contentType={contentType})")

            # Update progress after chapter completion
            progress = chapterIndex / totalChapters if totalChapters > 0 else 1.0
            self.services.chat.progressLogUpdate(
                parentOperationId,
                progress,
                f"Chapter {chapterIndex}/{totalChapters} completed: {chapterTitle}"
            )

        except Exception as e:
            logger.error(f"Error generating sections structure for chapter {chapterId}: {str(e)}")
            # Set empty sections on error
            chapter["sections"] = []
            # Update progress even on error
            progress = chapterIndex / totalChapters if totalChapters > 0 else 1.0
            self.services.chat.progressLogUpdate(
                parentOperationId,
                progress,
                f"Chapter {chapterIndex}/{totalChapters} error: {chapterTitle}"
            )
            raise
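
    # Expected AI response shape for the chapter-structure call (sketch of
    # what the parsing above consumes; exact fields are dictated by
    # _buildChapterSectionsStructurePrompt):
    #
    #   {"sections": [{
    #       "id": "sec_1",
    #       "title": "Overview",
    #       "content_type": "paragraph",
    #       "contentPartIds": ["part-a"],
    #       "generationHint": "Summarize the overview material",
    #       "useAiCall": true
    #   }]}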

    async def _generateChapterSectionsStructure(
        self,
        chapterStructure: Dict[str, Any],
        contentParts: List[ContentPart],
        userPrompt: str,
        parentOperationId: str,
        language: str,
        options: Optional[AiCallOptions] = None
    ) -> Dict[str, Any]:
        """
        Phase 5D.1: Generates the sections structure for each chapter (without content) in parallel.
        Sections contain: content_type, contentPartIds, generationHint, useAiCall
        """
        # Count total chapters for progress tracking
        totalChapters = sum(len(doc.get("chapters", [])) for doc in chapterStructure.get("documents", []))

        # Get concurrency limit
        maxConcurrent = self._getMaxConcurrentGeneration(options)
        semaphore = asyncio.Semaphore(maxConcurrent)

        # Collect all chapters with their indices for parallel processing
        chapterTasks = []
        chapterIndex = 0

        for doc in chapterStructure.get("documents", []):
            docId = doc.get("id", "unknown")
            # Get language for this specific document
            docLanguage = self._getDocumentLanguage(chapterStructure, docId)
            # Get output format for this specific document
            docFormat = doc.get("outputFormat", "txt")

            for chapter in doc.get("chapters", []):
                chapterIndex += 1
                chapterId = chapter.get("id", "unknown")
                chapterLevel = chapter.get("level", 1)
                chapterTitle = chapter.get("title", "Untitled Chapter")
                generationHint = chapter.get("generationHint", "")
                contentPartIds, contentPartInstructions = self._extractContentPartInfo(chapter)

                # Create task for parallel processing with semaphore
                async def processChapterWithSemaphore(chapter, chapterIndex, chapterId, chapterLevel, chapterTitle, generationHint, contentPartIds, contentPartInstructions, docLanguage, docFormat):
                    checkWorkflowStopped(self.services)
                    async with semaphore:
                        return await self._generateSingleChapterSectionsStructure(
                            chapter=chapter,
                            chapterIndex=chapterIndex,
                            chapterId=chapterId,
                            chapterLevel=chapterLevel,
                            chapterTitle=chapterTitle,
                            generationHint=generationHint,
                            contentPartIds=contentPartIds,
                            contentPartInstructions=contentPartInstructions,
                            contentParts=contentParts,
                            userPrompt=userPrompt,
                            language=docLanguage,  # Use document-specific language
                            outputFormat=docFormat,  # Use document-specific format
                            parentOperationId=parentOperationId,
                            totalChapters=totalChapters
                        )

                task = processChapterWithSemaphore(
                    chapter, chapterIndex, chapterId, chapterLevel, chapterTitle, generationHint, contentPartIds, contentPartInstructions, docLanguage, docFormat
                )
                chapterTasks.append((chapterIndex, chapter, task))

        # Execute all chapter tasks in parallel with concurrency control
        if chapterTasks:
            # Create list of tasks (without indices for gather)
            tasks = [task for _, _, task in chapterTasks]

            # Execute in parallel with error handling
            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Process results in order and handle errors
            for (originalIndex, originalChapter, _), result in zip(chapterTasks, results):
                if isinstance(result, Exception):
                    logger.error(f"Error processing chapter {originalChapter.get('id')}: {str(result)}")
                    # Chapter already has empty sections set by _generateSingleChapterSectionsStructure
                    # Continue processing other chapters

        return chapterStructure
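
    # The pattern above is plain semaphore-bounded fan-out. A standalone sketch
    # of the same idea (hypothetical names, for illustration only):
    #
    #   sem = asyncio.Semaphore(5)
    #
    #   async def bounded(coro_fn, *args):
    #       async with sem:          # at most 5 workers run concurrently
    #           return await coro_fn(*args)
    #
    #   results = await asyncio.gather(
    #       *(bounded(worker, item) for item in items),
    #       return_exceptions=True,  # failures surface as Exception objects
    #   )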

    async def _processAiResponseForSection(
        self,
        aiResponse: Any,
        contentType: str,
        operationType: OperationTypeEnum,
        sectionId: str,
        generationHint: str,
        generatedElements: List[Dict[str, Any]],
        section: Dict[str, Any]
    ) -> List[Dict[str, Any]]:
        """
        Helper method to process AI response and extract elements.
        Handles both IMAGE_GENERATE and DATA_ANALYSE operation types.
        """
        elements = []

        # Handle IMAGE_GENERATE differently - returns image data directly
        if contentType == "image" and operationType == OperationTypeEnum.IMAGE_GENERATE:
            import base64
            base64Data = ""

            # Convert image data to base64 string if needed
            if isinstance(aiResponse.content, bytes):
                base64Data = base64.b64encode(aiResponse.content).decode('utf-8')
            elif isinstance(aiResponse.content, str):
                # Check if it's already a JSON structure
                try:
                    jsonContent = json.loads(self.services.utils.jsonExtractString(aiResponse.content))
                    if isinstance(jsonContent, dict) and jsonContent.get("type") == "image":
                        elements.append(jsonContent)
                        logger.debug("AI returned proper JSON image structure")
                        base64Data = None  # Signal that image was already processed
                    elif isinstance(jsonContent, list) and len(jsonContent) > 0:
                        if isinstance(jsonContent[0], dict) and jsonContent[0].get("type") == "image":
                            elements.extend(jsonContent)
                            logger.debug("AI returned proper JSON image structure in list")
                            base64Data = None  # Signal that image was already processed
                        else:
                            base64Data = ""  # Continue with normal processing
                    else:
                        base64Data = ""  # Continue with normal processing
                except (json.JSONDecodeError, ValueError, AttributeError):
                    base64Data = ""  # Will be processed below

                # Process base64 if not already handled above
                if base64Data is None:
                    # Already processed as JSON, skip base64 processing
                    pass
                elif aiResponse.content.startswith("data:image/"):
                    # Extract base64 from data URI
                    base64Data = aiResponse.content.split(",", 1)[1]
                else:
                    content_stripped = aiResponse.content.strip()
                    if len(content_stripped) > 100 and all(c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=\n\r\t " for c in content_stripped[:200]):
                        base64Data = content_stripped.replace("\n", "").replace("\r", "").replace("\t", "").replace(" ", "")
                    else:
                        base64Data = aiResponse.content
            else:
                base64Data = ""

            # Always create proper JSON structure for images (if not already processed)
            if base64Data is None:
                # Image already processed as JSON, skip
                pass
            elif base64Data:
                # Get caption from section if available
                caption = section.get("caption") or section.get("metadata", {}).get("caption") or ""
                elements.append({
                    "type": "image",
                    "content": {
                        "base64Data": base64Data,
                        "altText": generationHint or "Generated image",
                        "caption": caption  # Use caption from section if available
                    },
                    "caption": caption  # Also at element level for compatibility
                })
                logger.debug(f"Created proper JSON image structure with base64Data length: {len(base64Data)}")
            else:
                logger.warning(f"IMAGE_GENERATE returned empty or invalid content for section {sectionId}")
                elements.append({
                    "type": "error",
                    "message": "Image generation returned empty or invalid content",
                    "sectionId": sectionId
                })
        else:
            # For non-image content: Use already parsed elements from _callAiWithLooping
            if generatedElements:
                elements.extend(generatedElements)
            else:
                # Fallback: Try to parse JSON response directly with repair logic
                try:
                    from modules.shared.jsonUtils import tryParseJson, repairBrokenJson

                    # Use tryParseJson which handles extraction and basic parsing
                    fallbackElements, parseError, cleanedStr = tryParseJson(aiResponse.content)

                    # If parsing failed, try repair
                    if parseError and isinstance(aiResponse.content, str):
                        logger.warning(f"Initial JSON parse failed for section {sectionId}, attempting repair: {str(parseError)}")
                        repairedJson = repairBrokenJson(aiResponse.content)
                        if repairedJson:
                            fallbackElements = repairedJson
                            parseError = None
                            logger.info(f"Successfully repaired JSON for section {sectionId}")

                    if parseError:
                        raise parseError

                    if isinstance(fallbackElements, list):
                        elements.extend(fallbackElements)
                    elif isinstance(fallbackElements, dict) and "elements" in fallbackElements:
                        elements.extend(fallbackElements["elements"])
                    elif isinstance(fallbackElements, dict) and fallbackElements.get("type"):
                        elements.append(fallbackElements)
                except (json.JSONDecodeError, ValueError) as json_error:
                    logger.error(f"Error parsing JSON response for section {sectionId}: {str(json_error)}")
                    elements.append({
                        "type": "error",
                        "message": f"Failed to parse JSON response: {str(json_error)}",
                        "sectionId": sectionId
                    })

        return elements
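
    # Element shapes produced above (sketch, taken from the branches in this
    # method): image elements carry the caption at two levels for
    # compatibility, and failures become explicit error elements:
    #
    #   {"type": "image",
    #    "content": {"base64Data": "...", "altText": "...", "caption": "..."},
    #    "caption": "..."}
    #
    #   {"type": "error", "message": "...", "sectionId": "sec_1"}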

    async def _processSingleSection(
        self,
        section: Dict[str, Any],
        sectionIndex: int,
        totalSections: int,
        chapterIndex: int,
        totalChapters: int,
        chapterId: str,
        chapterOperationId: str,
        fillOperationId: str,
        contentParts: List[ContentPart],
        userPrompt: str,
        all_sections_list: List[Dict[str, Any]],
        language: str,
        outputFormat: str = "txt",
        calculateOverallProgress: callable = None,
        preExtractedText: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """
        Process a single section and return its elements.
        Used for parallel processing of sections within a chapter.

        When preExtractedText is provided, the section uses the pre-extracted
        content directly in its prompt instead of sending raw content parts
        through the heavy extraction pipeline (avoids chunking + N*M AI calls).
        """
        sectionId = section.get("id")
        sectionTitle = section.get("title", sectionId)
        contentPartIds = section.get("contentPartIds", [])
        contentFormats = section.get("contentFormats", {})
        generationHint = section.get("generationHint") or section.get("generation_hint")
        contentType = section.get("content_type", "paragraph")
        useAiCall = section.get("useAiCall", False)

        # Update overall progress at start of section
        # (calculateOverallProgress defaults to None, so guard the call)
        if calculateOverallProgress:
            overallProgress = calculateOverallProgress(chapterIndex - 1, totalChapters, sectionIndex, totalSections)
            self.services.chat.progressLogUpdate(
                fillOperationId,
                overallProgress,
                f"Chapter {chapterIndex}/{totalChapters}, Section {sectionIndex + 1}/{totalSections}: {sectionTitle}"
            )

        # IMPORTANT: If there are no ContentParts AND no generationHint, no AI call can be made
        if len(contentPartIds) == 0 and not generationHint:
            useAiCall = False
            logger.debug(f"Section {sectionId}: No content parts and no generation hint, setting useAiCall=False")
        elif len(contentPartIds) == 0 and generationHint and not useAiCall:
            useAiCall = True
            logger.info(f"Section {sectionId}: Overriding useAiCall=True (has generationHint but no content parts)")

        elements = []

        # --- Fast path: use pre-extracted text instead of raw content parts ---
        if preExtractedText and useAiCall and generationHint:
            logger.info(
                f"Section {sectionId}: Using pre-extracted text "
                f"({len(preExtractedText):,} chars) - lightweight AI path"
            )

            for partId in contentPartIds:
                part = self._findContentPartById(partId, contentParts)
                if not part:
                    continue
                cf = contentFormats.get(partId, part.metadata.get("contentFormat"))
                if cf == "reference":
                    elements.append({
                        "type": "reference",
                        "documentReference": part.metadata.get("documentReference"),
                        "label": part.metadata.get("usageHint", part.label)
                    })
                elif cf == "object":
                    if part.typeGroup == "image" and part.data:
                        caption = (
                            section.get("caption")
                            or section.get("metadata", {}).get("caption")
                            or part.metadata.get("caption", "")
                        )
                        elements.append({
                            "type": "image",
                            "content": {
                                "base64Data": part.data,
                                "altText": part.metadata.get("usageHint", part.label),
                                "caption": caption
                            },
                            "caption": caption
                        })

            generationPrompt, templateStructure = self._buildSectionGenerationPrompt(
                section=section,
                contentParts=[],
                userPrompt=userPrompt,
                generationHint=generationHint,
                allSections=all_sections_list,
                sectionIndex=sectionIndex,
                isAggregation=False,
                language=language,
                outputFormat=outputFormat,
                preExtractedText=preExtractedText
            )

            sectionOperationId = f"{fillOperationId}_section_{sectionId}"
            self.services.chat.progressLogStart(
                sectionOperationId,
                "Section Generation (Pre-extracted)",
                f"Section {sectionIndex + 1}/{totalSections}",
                f"{sectionTitle} (pre-extracted)",
                parentOperationId=chapterOperationId
            )

            try:
                self.services.chat.progressLogUpdate(sectionOperationId, 0.4, "Calling AI for content generation")

                operationType = OperationTypeEnum.DATA_ANALYSE
                options = AiCallOptions(
                    operationType=operationType,
                    priority=PriorityEnum.BALANCED,
                    processingMode=ProcessingModeEnum.DETAILED
                )

                checkWorkflowStopped(self.services)
                aiResponseJson = await self.aiService.callAiWithLooping(
                    prompt=generationPrompt,
                    options=options,
                    debugPrefix=f"{chapterId}_section_{sectionId}",
                    promptBuilder=self.buildSectionPromptWithContinuation,
                    promptArgs={
                        "section": section,
                        "contentParts": [],
                        "userPrompt": userPrompt,
                        "generationHint": generationHint,
                        "allSections": all_sections_list,
                        "sectionIndex": sectionIndex,
                        "isAggregation": False,
                        "templateStructure": templateStructure,
                        "basePrompt": generationPrompt,
                        "language": language
                    },
                    operationId=sectionOperationId,
                    userPrompt=userPrompt,
                    contentParts=None,
                    useCaseId="section_content"
                )

                try:
                    from modules.shared.jsonUtils import tryParseJson, repairBrokenJson
                    if isinstance(aiResponseJson, str) and ("---" in aiResponseJson or aiResponseJson.count("```json") > 1):
                        generatedElements = self._extractAndMergeMultipleJsonBlocks(aiResponseJson, contentType, sectionId)
                    else:
                        parsedResponse, parseError, cleanedStr = tryParseJson(aiResponseJson)
                        if parsedResponse is None:
                            logger.warning(f"Section {sectionId}: tryParseJson failed, attempting repair")
                            repairedStr = repairBrokenJson(aiResponseJson)
                            parsedResponse, parseError2, _ = tryParseJson(repairedStr)

                        if parsedResponse and isinstance(parsedResponse, dict):
                            generatedElements = parsedResponse.get("elements", [])
                        elif parsedResponse and isinstance(parsedResponse, list):
                            generatedElements = parsedResponse
                        else:
                            generatedElements = []
                except Exception as parseErr:
                    logger.error(f"Section {sectionId}: JSON parse error: {parseErr}")
                    generatedElements = []

                self.services.chat.progressLogUpdate(sectionOperationId, 0.8, "Validating generated content")

                class _AiResponse:
                    def __init__(self, content):
                        self.content = content

                responseElements = await self._processAiResponseForSection(
                    aiResponse=_AiResponse(aiResponseJson),
                    contentType=contentType,
                    operationType=operationType,
                    sectionId=sectionId,
                    generationHint=generationHint,
                    generatedElements=generatedElements,
                    section=section
                )
                elements.extend(responseElements)
                self.services.chat.progressLogFinish(sectionOperationId, True)

            except Exception as e:
                self.services.chat.progressLogFinish(sectionOperationId, False)
                logger.error(f"Error in pre-extracted section {sectionId}: {e}")
                elements.append({
                    "type": "error",
                    "message": f"Error processing section {sectionId}: {str(e)}",
                    "sectionId": sectionId
                })

            return elements

        # --- Standard path: process content parts directly ---

        # Check whether aggregation is needed
        needsAggregation = self._needsAggregation(
            contentType=contentType,
            contentPartCount=len(contentPartIds)
        )

        logger.info(f"Processing section {sectionId}: contentType={contentType}, contentPartCount={len(contentPartIds)}, useAiCall={useAiCall}, needsAggregation={needsAggregation}, hasGenerationHint={bool(generationHint)}")

        try:
            if needsAggregation and useAiCall:
                # Aggregation: process all parts together
                sectionParts = [
                    self._findContentPartById(pid, contentParts)
                    for pid in contentPartIds
                ]
                sectionParts = [p for p in sectionParts if p is not None]

                if sectionParts:
                    # Keep only extracted parts for aggregation (reference/object are handled separately)
                    extractedParts = [
                        p for p in sectionParts
                        if contentFormats.get(p.id, p.metadata.get("contentFormat")) == "extracted"
                    ]
                    nonExtractedParts = [
                        p for p in sectionParts
                        if contentFormats.get(p.id, p.metadata.get("contentFormat")) != "extracted"
                    ]

                    # Process non-extracted parts separately (reference, object)
                    for part in nonExtractedParts:
                        contentFormat = contentFormats.get(part.id, part.metadata.get("contentFormat"))

                        if contentFormat == "reference":
                            elements.append({
                                "type": "reference",
                                "documentReference": part.metadata.get("documentReference"),
                                "label": part.metadata.get("usageHint", part.label)
                            })
                        elif contentFormat == "object":
                            if part.typeGroup == "image":
                                # Validate that image data exists
                                if not part.data:
                                    logger.warning(f"Section {sectionId}: Image ContentPart {part.id} has no data (object format). Skipping image element.")
                                    elements.append({
                                        "type": "error",
                                        "message": f"Image ContentPart {part.id} has no data",
                                        "sectionId": sectionId
                                    })
                                else:
                                    # Get caption from section (priority: section.caption > part.metadata.caption)
                                    caption = section.get("caption") or section.get("metadata", {}).get("caption") or part.metadata.get("caption", "")
                                    elements.append({
                                        "type": "image",
                                        "content": {
                                            "base64Data": part.data,
                                            "altText": part.metadata.get("usageHint", part.label),
                                            "caption": caption  # Use caption from section
                                        },
                                        "caption": caption  # Also at element level for compatibility
                                    })
                            else:
                                elements.append({
                                    "type": part.typeGroup,
                                    "content": {
                                        "data": part.data,
                                        "mimeType": part.mimeType,
                                        "label": part.metadata.get("usageHint", part.label)
                                    }
                                })

                    # Extract images with Vision AI if needed (before aggregation)
                    processedExtractedParts = []
                    for part in extractedParts:
                        # Check if this is an image that needs Vision AI extraction
                        if (part.typeGroup == "image" and
                                part.metadata.get("needsVisionExtraction") == True and
                                part.metadata.get("intent") == "extract"):

                            logger.info(f"Section {sectionId}: Extracting text from image {part.id} using Vision AI")
                            try:
                                extractionPrompt = part.metadata.get("extractionPrompt") or "Extract all text content from this image. Return only the extracted text, no additional formatting."

                                # Write debug file for image extraction prompt
                                if self.services and hasattr(self.services, 'utils') and hasattr(self.services.utils, 'writeDebugFile'):
                                    try:
                                        partId = part.id[:8] if part.id else "unknown"
                                        partLabelSafe = (part.label or "image").replace(" ", "_").replace("/", "_").replace("\\", "_")[:30]
                                        debugPrefix = f"extraction_image_{partId}_{partLabelSafe}"
                                        self.services.utils.writeDebugFile(extractionPrompt, f"{debugPrefix}_prompt")
                                        logger.debug(f"Wrote image extraction prompt debug file: {debugPrefix}_prompt")
                                    except Exception as debugError:
                                        logger.warning(f"Failed to write image extraction debug file: {str(debugError)}")

                                # Call Vision AI to extract text from image
                                visionRequest = AiCallRequest(
                                    prompt=extractionPrompt,
                                    context="",
                                    options=AiCallOptions(operationType=OperationTypeEnum.IMAGE_ANALYSE),
                                    contentParts=[part]
                                )

                                checkWorkflowStopped(self.services)
                                visionResponse = await self.aiService.callAi(visionRequest)

                                # Write debug file for image extraction response
                                if self.services and hasattr(self.services, 'utils') and hasattr(self.services.utils, 'writeDebugFile'):
                                    try:
                                        partId = part.id[:8] if part.id else "unknown"
                                        partLabelSafe = (part.label or "image").replace(" ", "_").replace("/", "_").replace("\\", "_")[:30]
                                        debugPrefix = f"extraction_image_{partId}_{partLabelSafe}"
                                        responseContent = visionResponse.content if visionResponse and visionResponse.content else ""
                                        self.services.utils.writeDebugFile(responseContent, f"{debugPrefix}_response")
                                        logger.debug(f"Wrote image extraction response debug file: {debugPrefix}_response")
                                    except Exception as debugError:
                                        logger.warning(f"Failed to write image extraction response debug file: {str(debugError)}")

                                if visionResponse and visionResponse.content:
                                    # Create text part with extracted content
                                    textPart = ContentPart(
                                        id=f"vision_extracted_{part.id}",
                                        label=f"Extracted text from {part.label or 'Image'}",
                                        typeGroup="text",
                                        mimeType="text/plain",
                                        data=visionResponse.content.strip(),
                                        metadata={
                                            **part.metadata,
                                            "contentFormat": "extracted",
                                            "extractionMethod": "vision",
                                            "sourceImagePartId": part.id,
                                            "needsVisionExtraction": False  # Already extracted
                                        }
                                    )
                                    processedExtractedParts.append(textPart)
                                    logger.info(f"✅ Extracted text from image {part.id}: {len(visionResponse.content)} chars")
                                else:
                                    logger.warning(f"⚠️ Vision AI extraction returned no content for image {part.id}")
                                    # Keep original image part, but mark extraction as attempted
                                    part.metadata["needsVisionExtraction"] = False
                                    part.metadata["visionExtractionFailed"] = True
                                    processedExtractedParts.append(part)
                            except Exception as e:
                                logger.error(f"❌ Vision AI extraction failed for image {part.id}: {str(e)}")
                                # Keep original image part, but mark extraction as attempted
                                part.metadata["needsVisionExtraction"] = False
                                part.metadata["visionExtractionFailed"] = True
                                processedExtractedParts.append(part)
                        else:
                            # Not an image needing extraction, or already processed
                            processedExtractedParts.append(part)

                    # Aggregate extracted parts with AI (now with Vision-extracted text parts)
                    if processedExtractedParts:
                        logger.debug(f"Section {sectionId}: Aggregating {len(processedExtractedParts)} extracted parts with AI")
                        isAggregation = True
                        generationPrompt, templateStructure = self._buildSectionGenerationPrompt(
                            section=section,
                            contentParts=processedExtractedParts,
                            userPrompt=userPrompt,
                            generationHint=generationHint,
                            allSections=all_sections_list,
                            sectionIndex=sectionIndex,
                            isAggregation=isAggregation,
                            language=language,
                            outputFormat=outputFormat
                        )

                        sectionOperationId = f"{fillOperationId}_section_{sectionId}"
                        self.services.chat.progressLogStart(
                            sectionOperationId,
                            "Section Generation (Aggregation)",
                            f"Section {sectionIndex + 1}/{totalSections}",
                            f"{sectionTitle} ({len(extractedParts)} parts)",
                            parentOperationId=chapterOperationId
                        )

                        try:
                            self.services.chat.progressLogUpdate(sectionOperationId, 0.2, "Building generation prompt")

                            self.services.chat.progressLogUpdate(sectionOperationId, 0.4, "Calling AI for content generation")

                            operationType = OperationTypeEnum.IMAGE_GENERATE if contentType == "image" else OperationTypeEnum.DATA_ANALYSE

                            if operationType == OperationTypeEnum.IMAGE_GENERATE:
                                maxPromptLength = 4000
                                if len(generationPrompt) > maxPromptLength:
                                    logger.warning(f"Truncating DALL-E prompt from {len(generationPrompt)} to {maxPromptLength} characters")
                                    generationPrompt = generationPrompt[:maxPromptLength].rsplit('\n', 1)[0]

                                # Write debug file for IMAGE_GENERATE (direct callAi, no _callAiWithLooping)
                                self.services.utils.writeDebugFile(
                                    generationPrompt,
                                    f"{chapterId}_section_{sectionId}_prompt"
                                )

                                request = AiCallRequest(
                                    prompt=generationPrompt,
                                    contentParts=[],
                                    options=AiCallOptions(
                                        operationType=operationType,
                                        priority=PriorityEnum.BALANCED,
                                        processingMode=ProcessingModeEnum.DETAILED
                                    )
                                )
                                checkWorkflowStopped(self.services)
                                aiResponse = await self.aiService.callAi(request)
                                generatedElements = []

                                # Write debug file for IMAGE_GENERATE response (direct callAi, no _callAiWithLooping)
                                self.services.utils.writeDebugFile(
                                    aiResponse.content if hasattr(aiResponse, 'content') else str(aiResponse),
                                    f"{chapterId}_section_{sectionId}_response"
                                )
                            else:
                                # Use consolidated class method
                                buildSectionPromptWithContinuation = self.buildSectionPromptWithContinuation

                                options = AiCallOptions(
                                    operationType=operationType,
                                    priority=PriorityEnum.BALANCED,
                                    processingMode=ProcessingModeEnum.DETAILED
                                )

                                checkWorkflowStopped(self.services)
                                aiResponseJson = await self.aiService.callAiWithLooping(
                                    prompt=generationPrompt,
                                    options=options,
                                    debugPrefix=f"{chapterId}_section_{sectionId}",
                                    promptBuilder=buildSectionPromptWithContinuation,
                                    promptArgs={
                                        "section": section,
                                        "contentParts": extractedParts,
                                        "userPrompt": userPrompt,
                                        "generationHint": generationHint,
                                        "allSections": all_sections_list,
                                        "sectionIndex": sectionIndex,
                                        "isAggregation": isAggregation,
                                        "templateStructure": templateStructure,
                                        "basePrompt": generationPrompt
                                    },
                                    operationId=sectionOperationId,
                                    userPrompt=userPrompt,
                                    contentParts=extractedParts,
                                    useCaseId="section_content"  # REQUIRED: Explicit use case ID
                                )

                                try:
                                    # Use tryParseJson which handles extraction and basic parsing
                                    from modules.shared.jsonUtils import tryParseJson, repairBrokenJson

                                    # Check if response contains multiple JSON blocks (separated by --- or multiple ```json blocks)
                                    # This can happen when AI returns multiple complete responses
                                    if isinstance(aiResponseJson, str) and ("---" in aiResponseJson or aiResponseJson.count("```json") > 1):
                                        logger.info(f"Section {sectionId}: Detected multiple JSON blocks in response, attempting to merge")
                                        generatedElements = self._extractAndMergeMultipleJsonBlocks(aiResponseJson, contentType, sectionId)
                                    else:
                                        parsedResponse, parseError, cleanedStr = tryParseJson(aiResponseJson)

                                        # If parsing failed, try repair
                                        if parseError and isinstance(aiResponseJson, str):
                                            logger.warning(f"Initial JSON parse failed for section {sectionId}, attempting repair: {str(parseError)}")
                                            repairedJson = repairBrokenJson(aiResponseJson)
                                            if repairedJson:
                                                parsedResponse = repairedJson
                                                parseError = None
                                                logger.info(f"Successfully repaired JSON for section {sectionId}")

                                        if parseError:
                                            raise parseError

                                        if isinstance(parsedResponse, list):
                                            generatedElements = parsedResponse
                                        elif isinstance(parsedResponse, dict):
                                            if "elements" in parsedResponse:
                                                generatedElements = parsedResponse["elements"]
                                            elif "sections" in parsedResponse and len(parsedResponse["sections"]) > 0:
                                                firstSection = parsedResponse["sections"][0]
                                                generatedElements = firstSection.get("elements", [])
                                            elif parsedResponse.get("type"):
                                                generatedElements = [parsedResponse]
                                            else:
                                                generatedElements = []
                                        else:
                                            generatedElements = []

                                    class AiResponse:
                                        def __init__(self, content):
                                            self.content = content

                                    aiResponse = AiResponse(aiResponseJson)
                                except Exception as parseError:
                                    logger.error(f"Error parsing response from _callAiWithLooping for section {sectionId}: {str(parseError)}")
                                    class AiResponse:
                                        def __init__(self, content):
                                            self.content = content
                                    aiResponse = AiResponse(aiResponseJson)
                                    generatedElements = []

                            self.services.chat.progressLogUpdate(sectionOperationId, 0.6, "Processing AI response")
                            # Note: Debug files are written by _callAiWithLooping using debugPrefix

                            self.services.chat.progressLogUpdate(sectionOperationId, 0.8, "Validating generated content")

                            # Process AI response
                            responseElements = await self._processAiResponseForSection(
                                aiResponse=aiResponse,
                                contentType=contentType,
                                operationType=operationType,
                                sectionId=sectionId,
                                generationHint=generationHint,
                                generatedElements=generatedElements,
                                section=section
                            )
                            elements.extend(responseElements)

                            self.services.chat.progressLogFinish(sectionOperationId, True)

                            chapterProgress = (sectionIndex + 1) / totalSections if totalSections > 0 else 1.0
                            self.services.chat.progressLogUpdate(
                                chapterOperationId,
                                chapterProgress,
                                f"Section {sectionIndex + 1}/{totalSections} completed"
                            )

                        except Exception as e:
                            self.services.chat.progressLogFinish(sectionOperationId, False)
                            elements.append({
                                "type": "error",
                                "message": f"Error generating section {sectionId}: {str(e)}",
                                "sectionId": sectionId
                            })
                            logger.error(f"Error generating section {sectionId}: {str(e)}")
                            chapterProgress = (sectionIndex + 1) / totalSections if totalSections > 0 else 1.0
                            self.services.chat.progressLogUpdate(
                                chapterOperationId,
                                chapterProgress,
                                f"Section {sectionIndex + 1}/{totalSections} completed (with errors)"
                            )

            else:
                # Individual processing: each part on its own OR generation without ContentParts
                if len(contentPartIds) == 0 and useAiCall and generationHint:
                    # Generate content from scratch using only generationHint
                    logger.debug(f"Processing section {sectionId}: No content parts, generating from generationHint only")
                    generationPrompt, templateStructure = self._buildSectionGenerationPrompt(
                        section=section,
                        contentParts=[],
                        userPrompt=userPrompt,
                        generationHint=generationHint,
                        allSections=all_sections_list,
                        sectionIndex=sectionIndex,
                        isAggregation=False,
                        language=language,
                        outputFormat=outputFormat
                    )

                    sectionOperationId = f"{fillOperationId}_section_{sectionId}"
                    self.services.chat.progressLogStart(
                        sectionOperationId,
                        "Section Generation",
                        f"Section {sectionIndex + 1}/{totalSections}",
                        f"{sectionTitle} (from generationHint)",
                        parentOperationId=chapterOperationId
                    )

                    try:
                        self.services.chat.progressLogUpdate(sectionOperationId, 0.2, "Building generation prompt")

                        self.services.chat.progressLogUpdate(sectionOperationId, 0.4, "Calling AI for content generation")

                        operationType = OperationTypeEnum.IMAGE_GENERATE if contentType == "image" else OperationTypeEnum.DATA_ANALYSE

                        if operationType == OperationTypeEnum.IMAGE_GENERATE:
                            maxPromptLength = 4000
                            if len(generationPrompt) > maxPromptLength:
                                logger.warning(f"Truncating DALL-E prompt from {len(generationPrompt)} to {maxPromptLength} characters")
                                generationPrompt = generationPrompt[:maxPromptLength].rsplit('\n', 1)[0]

                            # Write debug file for IMAGE_GENERATE (direct callAi, no _callAiWithLooping)
                            self.services.utils.writeDebugFile(
                                generationPrompt,
                                f"{chapterId}_section_{sectionId}_prompt"
                            )

                            request = AiCallRequest(
                                prompt=generationPrompt,
                                contentParts=[],
                                options=AiCallOptions(
                                    operationType=operationType,
                                    priority=PriorityEnum.BALANCED,
                                    processingMode=ProcessingModeEnum.DETAILED
                                )
                            )
                            aiResponse = await self.aiService.callAi(request)
                            generatedElements = []

                            # Write debug file for IMAGE_GENERATE response (direct callAi, no _callAiWithLooping)
                            self.services.utils.writeDebugFile(
                                aiResponse.content if hasattr(aiResponse, 'content') else str(aiResponse),
                                f"{chapterId}_section_{sectionId}_response"
                            )
                        else:
                            isAggregation = False

                            # Use consolidated class method
                            buildSectionPromptWithContinuation = self.buildSectionPromptWithContinuation

                            options = AiCallOptions(
                                operationType=operationType,
                                priority=PriorityEnum.BALANCED,
                                processingMode=ProcessingModeEnum.DETAILED
                            )

                            aiResponseJson = await self.aiService.callAiWithLooping(
                                prompt=generationPrompt,
                                options=options,
                                debugPrefix=f"{chapterId}_section_{sectionId}",
                                promptBuilder=self.buildSectionPromptWithContinuation,
                                promptArgs={
                                    "section": section,
                                    "contentParts": [],
                                    "userPrompt": userPrompt,
                                    "generationHint": generationHint,
                                    "allSections": all_sections_list,
                                    "sectionIndex": sectionIndex,
                                    "isAggregation": isAggregation,
                                    "templateStructure": templateStructure,
                                    "basePrompt": generationPrompt,
                                    "language": language
                                },
                                operationId=sectionOperationId,
                                userPrompt=userPrompt,
                                contentParts=[],
                                useCaseId="section_content"  # REQUIRED: Explicit use case ID
                            )

                            try:
                                parsedResponse = json.loads(self.services.utils.jsonExtractString(aiResponseJson))
                                if isinstance(parsedResponse, list):
                                    generatedElements = parsedResponse
                                elif isinstance(parsedResponse, dict):
                                    if "elements" in parsedResponse:
                                        generatedElements = parsedResponse["elements"]
                                    elif "sections" in parsedResponse and len(parsedResponse["sections"]) > 0:
                                        firstSection = parsedResponse["sections"][0]
                                        generatedElements = firstSection.get("elements", [])
                                    elif parsedResponse.get("type"):
                                        generatedElements = [parsedResponse]
                                    else:
                                        generatedElements = []
                                else:
                                    generatedElements = []

                                class AiResponse:
                                    def __init__(self, content):
                                        self.content = content

                                aiResponse = AiResponse(aiResponseJson)
                            except Exception as parseError:
                                logger.error(f"Error parsing response from _callAiWithLooping for section {sectionId}: {str(parseError)}")
                                class AiResponse:
                                    def __init__(self, content):
                                        self.content = content
                                aiResponse = AiResponse(aiResponseJson)
                                generatedElements = []

                        self.services.chat.progressLogUpdate(sectionOperationId, 0.6, "Processing AI response")
                        # Note: Debug files are written by _callAiWithLooping using debugPrefix

                        self.services.chat.progressLogUpdate(sectionOperationId, 0.8, "Validating generated content")

                        responseElements = await self._processAiResponseForSection(
                            aiResponse=aiResponse,
                            contentType=contentType,
                            operationType=operationType,
                            sectionId=sectionId,
                            generationHint=generationHint,
                            generatedElements=generatedElements,
                            section=section
                        )
                        elements.extend(responseElements)

                        self.services.chat.progressLogFinish(sectionOperationId, True)

                        chapterProgress = (sectionIndex + 1) / totalSections if totalSections > 0 else 1.0
                        self.services.chat.progressLogUpdate(
                            chapterOperationId,
                            chapterProgress,
                            f"Section {sectionIndex + 1}/{totalSections} completed"
                        )

                    except Exception as e:
                        self.services.chat.progressLogFinish(sectionOperationId, False)
                        elements.append({
                            "type": "error",
                            "message": f"Error generating section {sectionId}: {str(e)}",
                            "sectionId": sectionId
                        })
                        logger.error(f"Error generating section {sectionId}: {str(e)}")
                        chapterProgress = (sectionIndex + 1) / totalSections if totalSections > 0 else 1.0
                        self.services.chat.progressLogUpdate(
                            chapterOperationId,
                            chapterProgress,
                            f"Section {sectionIndex + 1}/{totalSections} completed (with errors)"
                        )

                # Individual processing: each part on its own
                for partId in contentPartIds:
                    part = self._findContentPartById(partId, contentParts)
                    if not part:
                        continue

                    contentFormat = contentFormats.get(partId, part.metadata.get("contentFormat"))

                    if contentFormat == "reference":
                        elements.append({
                            "type": "reference",
                            "documentReference": part.metadata.get("documentReference"),
                            "label": part.metadata.get("usageHint", part.label)
                        })

                    elif contentFormat == "object":
                        if part.typeGroup == "image":
                            # Validate that image data exists
                            if not part.data:
                                logger.warning(f"Section {sectionId}: Image ContentPart {part.id} has no data (object format). Skipping image element.")
                                elements.append({
                                    "type": "error",
                                    "message": f"Image ContentPart {part.id} has no data",
                                    "sectionId": sectionId
                                })
                            else:
                                # Get caption from section (priority: section.caption > part.metadata.caption)
                                caption = section.get("caption") or section.get("metadata", {}).get("caption") or part.metadata.get("caption", "")
                                elements.append({
                                    "type": "image",
                                    "content": {
                                        "base64Data": part.data,
                                        "altText": part.metadata.get("usageHint", part.label),
                                        "caption": caption  # Use caption from section
                                    },
                                    "caption": caption  # Also at element level for compatibility
                                })
                        else:
                            elements.append({
                                "type": part.typeGroup,
                                "content": {
                                    "data": part.data,
                                    "mimeType": part.mimeType,
                                    "label": part.metadata.get("usageHint", part.label)
                                }
                            })

                    elif contentFormat == "extracted":
                        # CRITICAL: If useAiCall is true, extracted parts are used as input for AI generation
                        # and should NOT be added as elements. Only add extracted text as element if useAiCall is false.
                        if useAiCall:
                            # Extracted part will be used as input for AI call - skip adding as element
                            logger.debug(f"Section {sectionId}: Skipping extracted part {part.id} as element (useAiCall=true, will be used as AI input)")
                            # Continue to process this part for AI call, but don't add as element yet
                            # Check if this is an image that needs Vision AI extraction
                            originalPartId = part.id
                            if (part.typeGroup == "image" and
                                    part.metadata.get("needsVisionExtraction") == True and
                                    part.metadata.get("intent") == "extract"):

                                logger.info(f"Section {sectionId}: Extracting text from single image {part.id} using Vision AI")
                                try:
                                    extractionPrompt = part.metadata.get("extractionPrompt") or "Extract all text content from this image. Return only the extracted text, no additional formatting."

                                    # Call Vision AI to extract text from image
                                    visionRequest = AiCallRequest(
                                        prompt=extractionPrompt,
                                        context="",
                                        options=AiCallOptions(operationType=OperationTypeEnum.IMAGE_ANALYSE),
                                        contentParts=[part]
                                    )

                                    checkWorkflowStopped(self.services)
                                    visionResponse = await self.aiService.callAi(visionRequest)

                                    if visionResponse and visionResponse.content:
                                        # Replace image part with text part for further processing
                                        part = ContentPart(
                                            id=f"vision_extracted_{originalPartId}",
                                            label=f"Extracted text from {part.label or 'Image'}",
                                            typeGroup="text",
                                            mimeType="text/plain",
                                            data=visionResponse.content.strip(),
                                            metadata={
                                                **part.metadata,
                                                "contentFormat": "extracted",
                                                "extractionMethod": "vision",
                                                "sourceImagePartId": originalPartId,
                                                "needsVisionExtraction": False  # Already extracted
                                            }
                                        )
                                        logger.info(f"✅ Extracted text from image {originalPartId}: {len(visionResponse.content)} chars")
                                    else:
                                        logger.warning(f"⚠️ Vision AI extraction returned no content for image {originalPartId}")
                                        part.metadata["needsVisionExtraction"] = False
                                        part.metadata["visionExtractionFailed"] = True
                                except Exception as e:
                                    logger.error(f"❌ Vision AI extraction failed for image {originalPartId}: {str(e)}")
                                    part.metadata["needsVisionExtraction"] = False
                                    part.metadata["visionExtractionFailed"] = True

                        if useAiCall and generationHint:
                            # AI call with a single ContentPart (may now be a text part after Vision extraction)
                            logger.debug(f"Processing section {sectionId}: Single extracted part with AI call")
                            generationPrompt, templateStructure = self._buildSectionGenerationPrompt(
                                section=section,
                                contentParts=[part],
                                userPrompt=userPrompt,
                                generationHint=generationHint,
                                allSections=all_sections_list,
                                sectionIndex=sectionIndex,
                                isAggregation=False,
                                language=language,
                                outputFormat=outputFormat
                            )

                            sectionOperationId = f"{fillOperationId}_section_{sectionId}"
                            self.services.chat.progressLogStart(
                                sectionOperationId,
                                "Section Generation",
                                f"Section {sectionIndex + 1}/{totalSections}",
                                f"{sectionTitle} (single part)",
                                parentOperationId=chapterOperationId
                            )

                            try:
                                self.services.chat.progressLogUpdate(sectionOperationId, 0.2, "Building generation prompt")

                                self.services.chat.progressLogUpdate(sectionOperationId, 0.4, "Calling AI for content generation")

                                operationType = OperationTypeEnum.IMAGE_GENERATE if contentType == "image" else OperationTypeEnum.DATA_ANALYSE

                                if operationType == OperationTypeEnum.IMAGE_GENERATE:
                                    maxPromptLength = 4000
                                    if len(generationPrompt) > maxPromptLength:
                                        logger.warning(f"Truncating DALL-E prompt from {len(generationPrompt)} to {maxPromptLength} characters")
                                        generationPrompt = generationPrompt[:maxPromptLength].rsplit('\n', 1)[0]

                                    # Write debug file for IMAGE_GENERATE (direct callAi, no _callAiWithLooping)
                                    self.services.utils.writeDebugFile(
                                        generationPrompt,
                                        f"{chapterId}_section_{sectionId}_prompt"
                                    )

                                    request = AiCallRequest(
                                        prompt=generationPrompt,
                                        contentParts=[],
                                        options=AiCallOptions(
                                            operationType=operationType,
                                            priority=PriorityEnum.BALANCED,
                                            processingMode=ProcessingModeEnum.DETAILED
                                        )
                                    )
                                    aiResponse = await self.aiService.callAi(request)
                                    generatedElements = []

                                    # Write debug file for IMAGE_GENERATE response (direct callAi, no _callAiWithLooping)
                                    self.services.utils.writeDebugFile(
                                        aiResponse.content if hasattr(aiResponse, 'content') else str(aiResponse),
                                        f"{chapterId}_section_{sectionId}_response"
                                    )
                                else:
                                    isAggregation = False

                                    # Use consolidated class method
                                    buildSectionPromptWithContinuation = self.buildSectionPromptWithContinuation

                                    options = AiCallOptions(
                                        operationType=operationType,
                                        priority=PriorityEnum.BALANCED,
                                        processingMode=ProcessingModeEnum.DETAILED
                                    )

                                    aiResponseJson = await self.aiService.callAiWithLooping(
                                        prompt=generationPrompt,
                                        options=options,
                                        debugPrefix=f"{chapterId}_section_{sectionId}",
                                        promptBuilder=self.buildSectionPromptWithContinuation,
                                        promptArgs={
                                            "section": section,
                                            "contentParts": [part],
                                            "userPrompt": userPrompt,
                                            "generationHint": generationHint,
                                            "allSections": all_sections_list,
                                            "sectionIndex": sectionIndex,
                                            "isAggregation": isAggregation,
                                            "services": self.services,
                                            "templateStructure": templateStructure,
                                            "basePrompt": generationPrompt,
                                            "language": language
                                        },
                                        operationId=sectionOperationId,
                                        userPrompt=userPrompt,
                                        contentParts=[part],
                                        useCaseId="section_content"  # REQUIRED: Explicit use case ID
                                    )

                                    try:
                                        parsedResponse = json.loads(self.services.utils.jsonExtractString(aiResponseJson))
                                        if isinstance(parsedResponse, list):
                                            generatedElements = parsedResponse
                                        elif isinstance(parsedResponse, dict):
                                            if "elements" in parsedResponse:
                                                generatedElements = parsedResponse["elements"]
                                            elif "sections" in parsedResponse and len(parsedResponse["sections"]) > 0:
                                                firstSection = parsedResponse["sections"][0]
                                                generatedElements = firstSection.get("elements", [])
                                            elif parsedResponse.get("type"):
                                                generatedElements = [parsedResponse]
                                            else:
                                                generatedElements = []
                                        else:
                                            generatedElements = []

                                        class AiResponse:
                                            def __init__(self, content):
                                                self.content = content

                                        aiResponse = AiResponse(aiResponseJson)
                                    except Exception as parseError:
                                        logger.error(f"Error parsing response from _callAiWithLooping for section {sectionId}: {str(parseError)}")
                                        class AiResponse:
                                            def __init__(self, content):
                                                self.content = content
                                        aiResponse = AiResponse(aiResponseJson)
                                        generatedElements = []

                                self.services.chat.progressLogUpdate(sectionOperationId, 0.6, "Processing AI response")
                                # Note: Debug files are written by _callAiWithLooping using debugPrefix

                                self.services.chat.progressLogUpdate(sectionOperationId, 0.8, "Validating generated content")

                                responseElements = await self._processAiResponseForSection(
                                    aiResponse=aiResponse,
                                    contentType=contentType,
                                    operationType=operationType,
                                    sectionId=sectionId,
                                    generationHint=generationHint,
                                    generatedElements=generatedElements,
                                    section=section
                                )
                                elements.extend(responseElements)

                                self.services.chat.progressLogFinish(sectionOperationId, True)

                                chapterProgress = (sectionIndex + 1) / totalSections if totalSections > 0 else 1.0
                                self.services.chat.progressLogUpdate(
                                    chapterOperationId,
                                    chapterProgress,
                                    f"Section {sectionIndex + 1}/{totalSections} completed"
                                )

                            except Exception as e:
                                self.services.chat.progressLogFinish(sectionOperationId, False)
                                elements.append({
                                    "type": "error",
                                    "message": f"Error generating section {sectionId}: {str(e)}",
                                    "sectionId": sectionId
                                })
                                logger.error(f"Error generating section {sectionId}: {str(e)}")
                                chapterProgress = (sectionIndex + 1) / totalSections if totalSections > 0 else 1.0
                                self.services.chat.progressLogUpdate(
                                    chapterOperationId,
                                    chapterProgress,
                                    f"Section {sectionIndex + 1}/{totalSections} completed (with errors)"
                                )
                        else:
                            # Add extracted content directly (no AI call)
                            # CRITICAL: If content_type is "image", we must render an image, not extracted text
                            if contentType == "image":
                                # Section wants to display an image - find the image part
                                if part.typeGroup == "image":
                                    # Direct image part - use it
                                    logger.debug(f"Processing section {sectionId}: Single extracted IMAGE part WITHOUT AI call")
                                    # Validate that image data exists
                                    if not part.data:
                                        logger.warning(f"Section {sectionId}: Image ContentPart {part.id} has no data (extracted format without AI call). Skipping image element.")
                                        elements.append({
                                            "type": "error",
                                            "message": f"Image ContentPart {part.id} has no data",
                                            "sectionId": sectionId
                                        })
                                    else:
                                        # Get caption from section (priority: section.caption > part.metadata.caption)
                                        caption = section.get("caption") or section.get("metadata", {}).get("caption") or part.metadata.get("caption", "")
                                        elements.append({
                                            "type": "image",
                                            "content": {
                                                "base64Data": part.data,
                                                "altText": part.metadata.get("usageHint", part.label),
                                                "caption": caption  # Use caption from section
                                            },
                                            "caption": caption  # Also at element level for compatibility
                                        })
                                elif part.typeGroup == "text" and part.metadata.get("sourceImagePartId"):
                                    # This is a vision-extracted text part - find the original image object part
                                    sourceImagePartId = part.metadata.get("sourceImagePartId")
                                    logger.debug(f"Processing section {sectionId}: Found vision-extracted text part, looking for original image object part: {sourceImagePartId}")
|
# Try to find the object part (format: "obj_...")
|
|
objectPartId = part.metadata.get("relatedObjectPartId")
|
|
objectPart = None
|
|
|
|
if objectPartId:
|
|
objectPart = self._findContentPartById(objectPartId, contentParts)
|
|
|
|
# If not found via metadata, search through all contentParts for object part
|
|
if not objectPart:
|
|
# Search for object part that references the source image part ID
|
|
for candidatePart in contentParts:
|
|
if (candidatePart.metadata.get("contentFormat") == "object" and
|
|
candidatePart.typeGroup == "image" and
|
|
sourceImagePartId in candidatePart.id):
|
|
objectPart = candidatePart
|
|
objectPartId = candidatePart.id
|
|
logger.debug(f"Section {sectionId}: Found object part {objectPartId} by searching all contentParts")
|
|
break
|
|
|
|
if objectPart and objectPart.typeGroup == "image" and objectPart.data:
|
|
logger.info(f"Section {sectionId}: Found object part {objectPartId} for image rendering")
|
|
caption = section.get("caption") or section.get("metadata", {}).get("caption") or objectPart.metadata.get("caption", "")
|
|
elements.append({
|
|
"type": "image",
|
|
"content": {
|
|
"base64Data": objectPart.data,
|
|
"altText": objectPart.metadata.get("usageHint", objectPart.label),
|
|
"caption": caption
|
|
},
|
|
"caption": caption
|
|
})
|
|
else:
|
|
logger.warning(f"Section {sectionId}: No object part found for vision-extracted text part {part.id} (sourceImagePartId={sourceImagePartId}), cannot render image")
|
|
elements.append({
|
|
"type": "error",
|
|
"message": f"Cannot render image: no object part found for extracted text part (sourceImagePartId={sourceImagePartId})",
|
|
"sectionId": sectionId
|
|
})
|
|
else:
|
|
logger.warning(f"Section {sectionId}: ContentPart {part.id} is not an image (typeGroup={part.typeGroup}), but section content_type is 'image'. Cannot render image.")
|
|
elements.append({
|
|
"type": "error",
|
|
"message": f"Cannot render image: ContentPart is not an image type",
|
|
"sectionId": sectionId
|
|
})
|
|
else:
|
|
# content_type is not "image" - add extracted text as normal
|
|
if part.typeGroup == "image":
|
|
logger.debug(f"Processing section {sectionId}: Single extracted IMAGE part WITHOUT AI call")
|
|
# Validate that image data exists
|
|
if not part.data:
|
|
logger.warning(f"Section {sectionId}: Image ContentPart {part.id} has no data (extracted format without AI call). Skipping image element.")
|
|
elements.append({
|
|
"type": "error",
|
|
"message": f"Image ContentPart {part.id} has no data",
|
|
"sectionId": sectionId
|
|
})
|
|
else:
|
|
# Get caption from section (priority: section.caption > part.metadata.caption)
|
|
caption = section.get("caption") or section.get("metadata", {}).get("caption") or part.metadata.get("caption", "")
|
|
elements.append({
|
|
"type": "image",
|
|
"content": {
|
|
"base64Data": part.data,
|
|
"altText": part.metadata.get("usageHint", part.label),
|
|
"caption": caption # Use caption from section
|
|
},
|
|
"caption": caption # Also at element level for compatibility
|
|
})
|
|
else:
|
|
logger.debug(f"Processing section {sectionId}: Single extracted TEXT part WITHOUT AI call")
|
|
elements.append({
|
|
"type": "extracted_text",
|
|
"content": part.data,
|
|
"source": part.metadata.get("documentId"),
|
|
"extractionPrompt": part.metadata.get("extractionPrompt")
|
|
})
|
|
|
|
# Update progress after section completion
|
|
chapterProgress = (sectionIndex + 1) / totalSections if totalSections > 0 else 1.0
|
|
self.services.chat.progressLogUpdate(
|
|
chapterOperationId,
|
|
chapterProgress,
|
|
f"Section {sectionIndex + 1}/{totalSections} completed"
|
|
)
|
|
|
|
overallProgress = calculateOverallProgress(chapterIndex - 1, totalChapters, sectionIndex + 1, totalSections)
|
|
self.services.chat.progressLogUpdate(
|
|
fillOperationId,
|
|
overallProgress,
|
|
f"Chapter {chapterIndex}/{totalChapters}, Section {sectionIndex + 1}/{totalSections} completed"
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Unexpected error processing section {sectionId}: {str(e)}")
|
|
elements.append({
|
|
"type": "error",
|
|
"message": f"Unexpected error processing section {sectionId}: {str(e)}",
|
|
"sectionId": sectionId
|
|
})
|
|
|
|
return elements
|
|
|
|

    async def _preExtractSharedContent(
        self,
        contentParts: List[ContentPart],
        allSectionTasks: List[Dict[str, Any]],
        userPrompt: str,
        parentOperationId: str
    ) -> Dict[str, str]:
        """
        Pre-extract content from large/shared content parts ONCE before parallel
        section filling. Returns a dict mapping sectionId -> pre-extracted text.

        Extracts a comprehensive plain-text summary per content part, then gives
        ALL sections referencing that part the SAME summary. Each section's own
        generationHint focuses the AI on the relevant aspect during generation.

        This eliminates the N*M AI-call explosion where N sections each independently
        chunk and process the same M-byte content part through the extraction pipeline.
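
        Illustrative sketch (IDs and sizes are invented): if sections "s1" and
        "s2" both reference the same 500 KB log part, one extraction call
        produces a single shared summary:

            cache = await self._preExtractSharedContent(
                contentParts, allSectionTasks, userPrompt, "op_fill_1"
            )
            # cache == {"s1": "<summary text>", "s2": "<summary text>"}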
        """
        SIZE_THRESHOLD = 100_000
        MIN_SHARED_SECTIONS = 2

        partToSections: Dict[str, List[Dict[str, Any]]] = {}
        for task in allSectionTasks:
            section = task["section"]
            for partId in section.get("contentPartIds", []):
                if partId not in partToSections:
                    partToSections[partId] = []
                partToSections[partId].append(section)

        if not partToSections:
            return {}

        preExtractedCache: Dict[str, str] = {}

        for partId, sections in partToSections.items():
            part = self._findContentPartById(partId, contentParts)
            if not part:
                continue

            contentFormat = part.metadata.get("contentFormat", "unknown")
            if contentFormat != "extracted":
                continue

            if part.typeGroup in ("image", "binary"):
                continue
            if part.mimeType and (
                part.mimeType.startswith("image/")
                or part.mimeType.startswith("video/")
                or part.mimeType.startswith("audio/")
            ):
                continue

            partSize = len(part.data) if part.data else 0
            numSections = len(sections)

            if numSections < MIN_SHARED_SECTIONS and partSize < SIZE_THRESHOLD:
                continue

            fileName = part.metadata.get("originalFileName", partId)
            logger.info(
                f"Pre-extracting content part {partId} "
                f"({partSize:,} bytes, referenced by {numSections} sections)"
            )

            topicLines = []
            for section in sections:
                hint = (
                    section.get("generationHint")
                    or section.get("generation_hint")
                    or section.get("title", "")
                )
                topicLines.append(f"- {hint}")
            topicsText = "\n".join(topicLines)

            extractionPrompt = (
                "# TASK: Extract key information from this document\n\n"
                "Extract ALL relevant information from the provided content as "
                "plain text. The extracted content will be used to generate a report "
                "covering the topics listed below.\n\n"
                f"## User Request\n{userPrompt}\n\n"
                f"## Report topics that need data\n{topicsText}\n\n"
                "## Instructions\n"
                "- Extract key facts, data points, timestamps, error messages, "
                "statistics, and specific findings\n"
                "- Organize by theme but output as PLAIN TEXT (not JSON)\n"
                "- Be comprehensive but concise - include specific data, "
                "skip generic filler\n"
                "- Include concrete examples with exact values from the source\n"
                "- Do NOT add commentary or analysis - just extract the raw data\n"
            )

            try:
                self.services.chat.progressLogUpdate(
                    parentOperationId, 0.05,
                    f"Pre-extracting content from {fileName} ({partSize:,} bytes)..."
                )

                def _preExtractionProgress(chunkProgress, message):
                    mapped = 0.05 + chunkProgress * 0.05
                    self.services.chat.progressLogUpdate(
                        parentOperationId, mapped,
                        f"Pre-extraction: {message}"
                    )

                request = AiCallRequest(
                    prompt=extractionPrompt,
                    contentParts=[part],
                    options=AiCallOptions(
                        operationType=OperationTypeEnum.DATA_ANALYSE,
                        priority=PriorityEnum.BALANCED,
                        processingMode=ProcessingModeEnum.DETAILED
                    )
                )

                checkWorkflowStopped(self.services)
                response = await self.aiService.callAi(request, progressCallback=_preExtractionProgress)
                responseText = response.content if hasattr(response, "content") else str(response)

                if responseText and len(responseText.strip()) > 50:
                    for section in sections:
                        sId = section.get("id", "unknown")
                        preExtractedCache[sId] = responseText
                    logger.info(
                        f"Pre-extraction of {partId} successful: "
                        f"{len(responseText):,} chars summary for {numSections} sections"
                    )
                    self.services.chat.progressLogUpdate(
                        parentOperationId, 0.10,
                        f"Pre-extraction complete ({len(responseText):,} chars). Starting section generation..."
                    )
                else:
                    logger.warning(
                        f"Pre-extraction of {partId} returned empty/short response "
                        f"({len(responseText) if responseText else 0} chars), "
                        "sections will fall back to direct extraction"
                    )
            except Exception as e:
                logger.error(
                    f"Pre-extraction of {partId} failed: {e}. "
                    "Sections will fall back to direct extraction."
                )

        if preExtractedCache:
            logger.info(
                f"Pre-extraction complete: {len(preExtractedCache)} sections "
                "have pre-extracted content (will use lightweight AI path)"
            )

        return preExtractedCache

    async def _fillChapterSections(
        self,
        chapterStructure: Dict[str, Any],
        contentParts: List[ContentPart],
        userPrompt: str,
        parentOperationId: str,
        language: str,
        options: Optional[AiCallOptions] = None
    ) -> Dict[str, Any]:
        """
        Phase 5D.2: Fills sections with ContentParts.
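
        All sections across all chapters are processed in parallel, bounded by a
        semaphore. Minimal sketch of the fan-out pattern used below (processTask
        stands in for the real per-section coroutine; the limit is assumed):

            semaphore = asyncio.Semaphore(5)

            async def bounded(task):
                async with semaphore:
                    return await processTask(task)

            results = await asyncio.gather(
                *(bounded(t) for t in tasks), return_exceptions=True
            )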
        """

        # Collect all sections for context information (shared by every section)
        all_sections_list = []
        for doc in chapterStructure.get("documents", []):
            for chapter in doc.get("chapters", []):
                for section in chapter.get("sections", []):
                    all_sections_list.append(section)

        # Compute the total chapter count for progress tracking
        totalChapters = sum(len(doc.get("chapters", [])) for doc in chapterStructure.get("documents", []))
        fillOperationId = parentOperationId

        # Get concurrency limit for sections
        maxConcurrent = self._getMaxConcurrentGeneration(options)
        sectionSemaphore = asyncio.Semaphore(maxConcurrent)

        # Collect ALL sections from ALL chapters for fully parallel processing
        # Each task carries: (docId, chapterId, chapterTitle, sectionIndex, section, docLanguage)
        allSectionTasks = []
        totalSections = len(all_sections_list)
        completedSections = [0]  # Mutable counter for progress tracking

        for doc in chapterStructure.get("documents", []):
            docId = doc.get("id", "unknown")
            docLanguage = self._getDocumentLanguage(chapterStructure, docId)
            docFormat = doc.get("outputFormat", "txt")  # Output format for this document

            for chapter in doc.get("chapters", []):
                chapterId = chapter.get("id", "unknown")
                chapterTitle = chapter.get("title", "Untitled Chapter")
                sections = chapter.get("sections", [])
                chapterSectionCount = len(sections)

                for sectionIndex, section in enumerate(sections):
                    allSectionTasks.append({
                        "docId": docId,
                        "chapterId": chapterId,
                        "chapterTitle": chapterTitle,
                        "sectionIndex": sectionIndex,
                        "chapterSectionCount": chapterSectionCount,
                        "section": section,
                        "docLanguage": docLanguage,
                        "docFormat": docFormat  # Include output format
                    })

        MAX_TOTAL_SECTIONS = 35
        if totalSections > MAX_TOTAL_SECTIONS:
            logger.warning(
                f"Structure has {totalSections} sections (limit {MAX_TOTAL_SECTIONS}). "
                "Truncating to stay within budget."
            )
            allSectionTasks = allSectionTasks[:MAX_TOTAL_SECTIONS]
            totalSections = len(allSectionTasks)

        preExtractedCache = await self._preExtractSharedContent(
            contentParts, allSectionTasks, userPrompt, fillOperationId
        )

        logger.info(f"Starting FULLY PARALLEL section generation: {totalSections} sections across {totalChapters} chapters")

        # Create a task wrapper for each section with progress tracking
        async def processSectionWithSemaphore(taskInfo):
            checkWorkflowStopped(self.services)
            sectionId = taskInfo["section"].get("id", "unknown")
            async with sectionSemaphore:
                result = await self._processSingleSection(
                    section=taskInfo["section"],
                    sectionIndex=taskInfo["sectionIndex"],
                    totalSections=taskInfo["chapterSectionCount"],
                    chapterIndex=0,
                    totalChapters=totalChapters,
                    chapterId=taskInfo["chapterId"],
                    chapterOperationId=fillOperationId,
                    fillOperationId=fillOperationId,
                    contentParts=contentParts,
                    userPrompt=userPrompt,
                    all_sections_list=all_sections_list,
                    language=taskInfo["docLanguage"],
                    outputFormat=taskInfo.get("docFormat", "txt"),
                    calculateOverallProgress=lambda *args: completedSections[0] / totalSections if totalSections > 0 else 1.0,
                    preExtractedText=preExtractedCache.get(sectionId)
                )

            # Update progress after each section completes
            completedSections[0] += 1
            overallProgress = completedSections[0] / totalSections if totalSections > 0 else 1.0
            self.services.chat.progressLogUpdate(
                fillOperationId,
                overallProgress,
                f"Section {completedSections[0]}/{totalSections} completed: {sectionId}"
            )

            return result

        # Create all tasks
        tasks = [processSectionWithSemaphore(taskInfo) for taskInfo in allSectionTasks]

        # Execute ALL sections in parallel with concurrency control
        if tasks:
            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Assign results back to sections
            for taskInfo, result in zip(allSectionTasks, results):
                section = taskInfo["section"]
                if isinstance(result, Exception):
                    logger.error(f"Error processing section {section.get('id')}: {str(result)}")
                    section["elements"] = [{
                        "type": "error",
                        "message": f"Error processing section: {str(result)}",
                        "sectionId": section.get("id")
                    }]
                else:
                    section["elements"] = result if result is not None else []

        logger.info(f"Completed FULLY PARALLEL section generation: {totalSections} sections")

        return chapterStructure

    def _addContentPartsMetadata(
        self,
        structure: Dict[str, Any],
        contentParts: List[ContentPart]
    ) -> Dict[str, Any]:
        """
        Adds ContentParts metadata to the structure wherever contentPartIds are present.
        This helps validation understand the context of the ContentParts.
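
        Illustrative sketch (field values invented): a section
        {"id": "s1", "contentPartIds": ["part_a"]} gains

            "contentPartsMetadata": [{"id": "part_a", "format": "extracted",
                                      "type": "text", "mimeType": "text/plain",
                                      ...}]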
        """
        # Build a mapping from ContentPart ID to metadata
        contentPartsMap = {}
        for part in contentParts:
            contentPartsMap[part.id] = {
                "id": part.id,
                "format": part.metadata.get("contentFormat", "unknown"),
                "type": part.typeGroup,
                "mimeType": part.mimeType,
                "originalFileName": part.metadata.get("originalFileName"),
                "usageHint": part.metadata.get("usageHint"),
                "documentId": part.metadata.get("documentId"),
                "dataSize": len(str(part.data)) if part.data else 0
            }

        # Add metadata to sections that have contentPartIds
        for doc in structure.get("documents", []):
            # Check whether chapters are present (new structure)
            if "chapters" in doc:
                for chapter in doc.get("chapters", []):
                    # Add metadata for chapter-level contentPartIds
                    chapterContentPartIds, _ = self._extractContentPartInfo(chapter)
                    if chapterContentPartIds:
                        chapter["contentPartsMetadata"] = []
                        for partId in chapterContentPartIds:
                            if partId in contentPartsMap:
                                chapter["contentPartsMetadata"].append(contentPartsMap[partId])

                    # Add metadata to sections
                    for section in chapter.get("sections", []):
                        contentPartIds = section.get("contentPartIds", [])
                        if contentPartIds:
                            section["contentPartsMetadata"] = []
                            for partId in contentPartIds:
                                if partId in contentPartsMap:
                                    section["contentPartsMetadata"].append(contentPartsMap[partId])

        return structure

    def _flattenChaptersToSections(
        self,
        chapterStructure: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Flattening: converts chapters into the final section structure.
        Each chapter becomes a heading section (level 1) followed by its sections.

        Chapters are the main structure elements (heading level 1).
        All section headings with level < 2 are adjusted to level 2.
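
        Illustrative sketch (IDs invented): a chapter
        {"id": "ch1", "title": "Intro", "sections": [{"id": "s1", ...}]} becomes

            [{"id": "ch1_heading", "content_type": "heading",
              "elements": [{"type": "heading",
                            "content": {"text": "Intro", "level": 1}}]},
             {"id": "s1", ...}]  # section heading levels raised to >= 2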
        """
        result = {
            "metadata": chapterStructure.get("metadata", {}),
            "documents": []
        }

        for doc in chapterStructure.get("documents", []):
            flattened_doc = {
                "id": doc.get("id"),
                "title": doc.get("title"),
                "filename": doc.get("filename"),
                "outputFormat": doc.get("outputFormat"),  # Preserve from Phase 3
                "language": doc.get("language"),  # Preserve from Phase 3
                "sections": []
            }

            for chapter in doc.get("chapters", []):
                # 1. Predefined heading section for the chapter title (ALWAYS level 1)
                heading_section = {
                    "id": f"{chapter['id']}_heading",
                    "content_type": "heading",
                    "elements": [{
                        "type": "heading",
                        "content": {
                            "text": chapter.get("title", ""),
                            "level": 1  # Chapters are always level 1
                        }
                    }]
                }
                flattened_doc["sections"].append(heading_section)

                # 2. Generated sections - adjust heading levels
                for section in chapter.get("sections", []):
                    # CRITICAL: Ensure elements are preserved when flattening.
                    # _adjustSectionHeadingLevels uses deepcopy, which should preserve
                    # elements, but verify that elements exist in the source section.
                    adjusted_section = self._adjustSectionHeadingLevels(section)
                    # Ensure elements are preserved (deepcopy should handle this, but double-check)
                    if "elements" in section and "elements" not in adjusted_section:
                        adjusted_section["elements"] = section["elements"]
                    flattened_doc["sections"].append(adjusted_section)

            result["documents"].append(flattened_doc)

        return result

    def _adjustSectionHeadingLevels(self, section: Dict[str, Any]) -> Dict[str, Any]:
        """
        Adjust heading levels: heading elements with level < 2 are raised to level 2.
        Only chapter headings keep level 1.
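
        Example (illustrative values):

            section = {"content_type": "heading",
                       "elements": [{"type": "heading",
                                     "content": {"text": "Details", "level": 1}}]}
            adjusted = self._adjustSectionHeadingLevels(section)
            # adjusted["elements"][0]["content"]["level"] == 2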
        """
        adjusted_section = copy.deepcopy(section)

        # Check if this is a heading section
        if adjusted_section.get("content_type") == "heading":
            elements = adjusted_section.get("elements", [])
            for element in elements:
                if isinstance(element, dict) and element.get("type") == "heading":
                    content = element.get("content", {})
                    if isinstance(content, dict):
                        level = content.get("level", 1)
                        # If level < 2, raise to level 2 (only chapters have level 1)
                        if level < 2:
                            content["level"] = 2

        return adjusted_section

    def _buildChapterSectionsStructurePrompt(
        self,
        chapterId: str,
        chapterLevel: int,
        chapterTitle: str,
        generationHint: str,
        contentPartIds: List[str],
        contentPartInstructions: Dict[str, Any],
        contentParts: List[ContentPart],
        userPrompt: str,
        language: str = "en",
        outputFormat: str = "txt"
    ) -> str:
        """Build the prompt for chapter-sections structure generation, querying the renderer for accepted section types."""
        # Build the ContentParts index (IDs only, no previews!)
        contentPartsIndex = ""
        for partId in contentPartIds:
            part = self._findContentPartById(partId, contentParts)
            if not part:
                # Part not found - try to show info from the chapter structure
                partInfo = contentPartInstructions.get(partId, {})
                if partInfo:
                    logger.warning(f"Chapter {chapterId}: ContentPart {partId} not found in contentParts list, but has chapter structure info.")
                    contentPartsIndex += f"\n- ContentPart ID: {partId}\n"
                    if "instruction" in partInfo:
                        contentPartsIndex += f"  Instruction: {partInfo['instruction']}\n"
                    if "caption" in partInfo:
                        contentPartsIndex += f"  Caption: {partInfo['caption']}\n"
                    contentPartsIndex += "  Note: ContentPart not found in contentParts list (ID may be from nested structure)\n"
                continue

            contentFormat = part.metadata.get("contentFormat", "unknown")
            partInfo = contentPartInstructions.get(partId, {})
            instruction = partInfo.get("instruction", "Use content as needed")
            caption = partInfo.get("caption")

            contentPartsIndex += f"\n- ContentPart ID: {partId}\n"
            contentPartsIndex += f"  Format: {contentFormat}\n"
            contentPartsIndex += f"  Type: {part.typeGroup}\n"
            if instruction and instruction != "Use content as needed":
                contentPartsIndex += f"  Instruction: {instruction}\n"
            if caption:
                contentPartsIndex += f"  Caption: {caption}\n"

        if not contentPartsIndex:
            contentPartsIndex = "\n(No content parts specified for this chapter)"

        # Query renderer for accepted section types
        acceptedSectionTypes = self._getAcceptedSectionTypesForFormat(outputFormat)

        prompt = f"""TASK: Generate Chapter Sections Structure

LANGUAGE: Generate all content in {language.upper()} language. All text, titles, headings, paragraphs, and content must be written in {language.upper()}.

CHAPTER: {chapterTitle} (Level {chapterLevel}, ID: {chapterId})
GENERATION HINT: {generationHint}

## CONTENT EFFICIENCY PRINCIPLES
- Generate COMPACT sections: Focus on essential information only
- AVOID creating too many sections - combine related content where possible
- Each section should serve a clear purpose with meaningful data
- If no relevant data exists for a topic, do NOT create a section for it
- Prefer ONE comprehensive section over multiple sparse sections
- HARD LIMIT: Maximum 5 sections per chapter. Combine related subtopics into single sections to stay within this limit.

**CRITICAL**: The chapter's generationHint above describes what content this chapter should generate. If the generationHint references documents/images/data, then EACH section that generates content for this chapter MUST assign the relevant ContentParts from AVAILABLE CONTENT PARTS below.

NOTE: Chapter already has a heading section. Do NOT generate a heading for the chapter title.

## SECTION INDEPENDENCE
- Each section is independent and self-contained
- One section does NOT have information about another section
- Each section must provide its own context and be understandable alone

AVAILABLE CONTENT PARTS:
{contentPartsIndex}

## CONTENT ASSIGNMENT RULE - CRITICAL
If AVAILABLE CONTENT PARTS are listed above, then EVERY section that generates content related to those ContentParts MUST assign them explicitly.

**Assignment logic:**
- If section generates text content ABOUT a ContentPart → assign "extracted" format ContentPart with appropriate instruction
- If section DISPLAYS a ContentPart → assign "object" format ContentPart
- If section's generationHint or purpose relates to a ContentPart listed above → it MUST have contentPartIds assigned
- If chapter's generationHint references documents/images/data AND section generates content for that chapter → section MUST assign relevant ContentParts
- Empty contentPartIds [] are only allowed if section generates content WITHOUT referencing any available ContentParts AND WITHOUT relating to chapter's generationHint

## ACCEPTED CONTENT TYPES FOR THIS FORMAT
The document output format ({outputFormat}) accepts only the following content types:
{', '.join(acceptedSectionTypes)}

**CRITICAL**: Only create sections with content types from this list. Other types will fail.

useAiCall RULE (simple):
- useAiCall: true → Content needs AI processing (extract, transform, generate, filter, summarize)
- useAiCall: false → Content can be inserted directly without changes (Format is "object" or "reference")

RETURN JSON:
{{
  "sections": [
    {{
      "id": "section_1",
      "content_type": "{acceptedSectionTypes[0]}",
      "contentPartIds": ["extracted_part_id"],
      "generationHint": "Description of what to extract or generate",
      "useAiCall": true,
      "elements": []
    }}
  ]
}}

**MANDATORY CONTENT ASSIGNMENT CHECK:**
For each section, verify:
1. Are ContentParts listed in AVAILABLE CONTENT PARTS above?
2. Does this section's generationHint or purpose relate to those ContentParts?
3. If YES to both → section MUST have contentPartIds assigned (cannot be empty [])
4. Assign ContentPart IDs exactly as listed in AVAILABLE CONTENT PARTS above

IMAGE SECTIONS:
- For image sections, always provide a "caption" field with a descriptive caption for the image.

Return only valid JSON. Do not include any explanatory text outside the JSON.
"""
        return prompt

    def _getContentStructureExample(self, contentType: str) -> str:
        """Get the JSON structure example for a specific content type."""
        structures = {
            "table": '{{"headers": ["Column1", "Column2"], "rows": [["Value1", "Value2"], ["Value3", "Value4"]]}}',
            "bullet_list": '{{"items": ["Item 1", "Item 2", "Item 3"]}}',
            "heading": '{{"text": "Section Title", "level": 2}}',
            "paragraph": '{{"text": "This is paragraph text."}}',
            "code_block": '{{"code": "function example() {{ return true; }}", "language": "javascript"}}',
            "image": '{{"base64Data": "<base64_encoded_image_data>", "altText": "Description", "caption": "Optional caption"}}'
        }
        return structures.get(contentType, '{{"text": ""}}')

    def _buildSectionGenerationPrompt(
        self,
        section: Dict[str, Any],
        contentParts: List[Optional[ContentPart]],
        userPrompt: str,
        generationHint: str,
        allSections: Optional[List[Dict[str, Any]]] = None,
        sectionIndex: Optional[int] = None,
        isAggregation: bool = False,
        language: str = "en",
        outputFormat: str = "txt",
        preExtractedText: Optional[str] = None
    ) -> Tuple[str, str]:
        """Build the prompt for section generation with full context."""
        # Filter out None values
        validParts = [p for p in contentParts if p is not None]

        # Section metadata
        sectionId = section.get("id", "unknown")
        contentType = section.get("content_type", "paragraph")

        # Build the ContentParts description
        contentPartsText = ""
        if isAggregation:
            # Aggregation: ContentParts are passed as parameters, so no IDs are needed
            # in the prompt - the data is already available in the context.
            contentPartsText = ""
        else:
            # Individual processing: show previews
            for part in validParts:
                contentFormat = part.metadata.get("contentFormat", "unknown")
                contentPartsText += f"\n- ContentPart {part.id}:\n"
                contentPartsText += f"  Format: {contentFormat}\n"
                contentPartsText += f"  Type: {part.typeGroup}\n"
                if part.metadata.get("originalFileName"):
                    contentPartsText += f"  Source file: {part.metadata.get('originalFileName')}\n"

                if contentFormat == "extracted":
                    # CRITICAL: Check if this is binary/image data - NEVER include it in a text prompt!
                    isBinaryOrImage = (
                        part.typeGroup == "image" or
                        part.typeGroup == "binary" or
                        (part.mimeType and (
                            part.mimeType.startswith("image/") or
                            part.mimeType.startswith("video/") or
                            part.mimeType.startswith("audio/") or
                            self._isBinaryMimeType(part.mimeType)
                        )) or
                        # Heuristic check: if data looks like base64 (long string of base64 chars)
                        (part.data and isinstance(part.data, str) and
                         len(part.data) > 100 and
                         self._looksLikeBase64(part.data))
                    )

                    if isBinaryOrImage:
                        # NEVER include binary/base64 data in a text prompt - security risk and token explosion!
                        dataLength = len(part.data) if part.data else 0
                        contentPartsText += f"  Type: {part.typeGroup}\n"
                        contentPartsText += f"  MIME type: {part.mimeType or 'unknown'}\n"
                        contentPartsText += f"  Data size: {dataLength} chars (binary/base64 - not shown in prompt)\n"
                        if part.metadata.get("needsVisionExtraction"):
                            contentPartsText += "  Note: Will be processed with Vision AI\n"
                        if part.metadata.get("usageHint"):
                            contentPartsText += f"  Usage hint: {part.metadata.get('usageHint')}\n"
                    else:
                        # Only for text data: show a preview
                        previewLength = 1000
                        if part.data:
                            preview = part.data[:previewLength] + "..." if len(part.data) > previewLength else part.data
                            contentPartsText += f"  Content preview:\n```\n{preview}\n```\n"
                        else:
                            contentPartsText += "  Content: (empty)\n"
                elif contentFormat == "reference":
                    contentPartsText += f"  Reference: {part.metadata.get('documentReference')}\n"
                    if part.metadata.get("usageHint"):
                        contentPartsText += f"  Usage hint: {part.metadata.get('usageHint')}\n"
                elif contentFormat == "object":
                    dataLength = len(part.data) if part.data else 0
                    contentPartsText += f"  Object type: {part.typeGroup}\n"
                    contentPartsText += f"  MIME type: {part.mimeType}\n"
                    contentPartsText += f"  Data size: {dataLength} chars (base64 encoded)\n"
                    if part.metadata.get("usageHint"):
                        contentPartsText += f"  Usage hint: {part.metadata.get('usageHint')}\n"

        # Build the section context (previous and following sections)
        contextText = ""
        if allSections and sectionIndex is not None:
            prevSections = []
            nextSections = []

            if sectionIndex > 0:
                for i in range(max(0, sectionIndex - 2), sectionIndex):
                    prevSection = allSections[i]
                    prevSections.append({
                        "id": prevSection.get("id"),
                        "content_type": prevSection.get("content_type"),
                        "generation_hint": prevSection.get("generation_hint", "")[:100]
                    })

            if sectionIndex < len(allSections) - 1:
                for i in range(sectionIndex + 1, min(len(allSections), sectionIndex + 3)):
                    nextSection = allSections[i]
                    nextSections.append({
                        "id": nextSection.get("id"),
                        "content_type": nextSection.get("content_type"),
                        "generation_hint": nextSection.get("generation_hint", "")[:100]
                    })

            if prevSections or nextSections:
                contextText = "\n## DOCUMENT CONTEXT\n"
                if prevSections:
                    contextText += "\nPrevious sections:\n"
                    for prev in prevSections:
                        contextText += f"- {prev['id']} ({prev['content_type']}): {prev['generation_hint']}\n"
                if nextSections:
                    contextText += "\nFollowing sections:\n"
                    for nextSec in nextSections:
                        contextText += f"- {nextSec['id']} ({nextSec['content_type']}): {nextSec['generation_hint']}\n"

        # Get accepted section types for the output format
        acceptedTypesAggr = self._getAcceptedSectionTypesForFormat(outputFormat)

        # CRITICAL: If the section's content_type is not supported by the output format,
        # use the first accepted type instead. E.g., CSV only supports 'table', so even
        # if the section says 'code_block', we must output as 'table'.
        effectiveContentType = contentType
        if contentType not in acceptedTypesAggr and acceptedTypesAggr:
            effectiveContentType = acceptedTypesAggr[0]
            logger.debug(f"Section {sectionId}: Content type '{contentType}' not supported by format '{outputFormat}', using '{effectiveContentType}' instead")

        contentStructureExample = self._getContentStructureExample(effectiveContentType)

        # Build the format note for the prompt - purely dynamic from the renderer.
        # Always show which types are accepted for this format.
        formatNoteAggr = f"\n- Target Output Format: {outputFormat.upper()} (accepted content types: {', '.join(acceptedTypesAggr)})"

        # Create the template structure explicitly (not extracted from the prompt).
        # This ensures exact identity between initial and continuation prompts.
        templateStructure = f"""{{
  "elements": [
    {{
      "type": "{effectiveContentType}",
      "content": {contentStructureExample}
    }}
  ]
}}"""

        if isAggregation:
            prompt = f"""# TASK: Generate Section Content (Aggregation)

Return only valid JSON. No explanatory text, no comments, no markdown formatting outside JSON.
If ContentParts have no data, return: {{"elements": [{{"type": "{effectiveContentType}", "content": {{"headers": [], "rows": []}}}}]}}

LANGUAGE: Generate all content in {language.upper()} language. All text, titles, headings, paragraphs, and content must be written in {language.upper()}.

## SECTION METADATA
- Section ID: {sectionId}
- Content Type: {effectiveContentType}
- Generation Hint: {generationHint}{formatNoteAggr}

## CONTENT EFFICIENCY PRINCIPLES
- Generate COMPACT content: Focus on essential facts only
- AVOID verbose text, filler phrases, or redundant explanations
- Be CONCISE and direct - every word should add value
- NO introductory phrases like "This section describes..." or "Here we present..."
- Minimize output size for efficient processing

## INSTRUCTIONS
1. Extract all data from the context provided. Do not skip or omit any data.
2. Extract data only from the provided context. Never invent, create, or generate data that is not in the context.
3. If the context contains no data, return empty structures (empty rows array for tables).
4. Aggregate all data into one element (e.g., one table).
5. For table: Extract all rows from the context. Return {{"headers": [...], "rows": []}} only if no data exists.
6. Format based on content_type ({effectiveContentType}).
7. No HTML/styling: Plain text only, no markup.
8. Focus on the MOST RELEVANT information for this section's topic. Extract key facts, data, and findings. Omit redundant, repetitive, or tangential content.

## OUTPUT FORMAT
Return a JSON object with this structure:

{{
  "elements": [
    {{
      "type": "{effectiveContentType}",
      "content": {contentStructureExample}
    }}
  ]
}}

Output requirements:
- "content" must be an object (never a string)
- Return only valid JSON - no text before, no text after, no comments, no explanations
- No invented data: Return empty structures if ContentParts have no data
- Extract all data: Process every ContentPart completely and include all extracted data

## USER REQUEST (for context)
```
{userPrompt}
```

## CONTEXT
{contextText if contextText else ""}
"""
        elif preExtractedText:
            prompt = f"""# TASK: Generate Section Content from Pre-Extracted Data

LANGUAGE: Generate all content in {language.upper()} language. All text, titles, headings, paragraphs, and content must be written in {language.upper()}.

## SECTION METADATA
- Section ID: {sectionId}
- Content Type: {effectiveContentType}
- Generation Hint: {generationHint}{formatNoteAggr}

## CONTENT EFFICIENCY PRINCIPLES
- Generate COMPACT content: Focus on essential facts only
- AVOID verbose text, filler phrases, or redundant explanations
- Be CONCISE and direct - every word should add value
- NO introductory phrases like "This section describes..." or "Here we present..."
- Minimize output size for efficient processing

## PRE-EXTRACTED CONTENT FOR THIS SECTION
```
{preExtractedText}
```

## INSTRUCTIONS
1. Use ONLY the pre-extracted content above. Never invent or generate data not present in it.
2. If the pre-extracted content is empty, return empty structures.
3. Format based on content_type ({effectiveContentType}).
4. Return only valid JSON with "elements" array.
5. No HTML/styling: Plain text only, no markup.
6. Focus on the MOST RELEVANT information. Be concise.

## OUTPUT FORMAT
Return a JSON object with this structure:

{{
  "elements": [
    {{
      "type": "{effectiveContentType}",
      "content": {contentStructureExample}
    }}
  ]
}}

Output requirements:
- "content" must be an object (never a string)
- Return only valid JSON - no text before, no text after, no comments, no explanations, no markdown code fences
- Start with {{ and end with }} - return ONLY the JSON object itself
- No invented data: Return empty structures if pre-extracted content is empty

## USER REQUEST
```
{userPrompt}
```

## CONTEXT
{contextText if contextText else ""}
"""
        else:
            # Determine whether we have ContentParts or need to generate from scratch
            hasContentParts = len(validParts) > 0

            if hasContentParts:
                # EXTRACT MODE: Extract data from the provided ContentParts
                prompt = f"""# TASK: Extract Section Content from Provided Data

LANGUAGE: Generate all content in {language.upper()} language. All text, titles, headings, paragraphs, and content must be written in {language.upper()}.

## SECTION METADATA
- Section ID: {sectionId}
- Content Type: {effectiveContentType}
- Generation Hint: {generationHint}{formatNoteAggr}

## CONTENT EFFICIENCY PRINCIPLES
- Generate COMPACT content: Focus on essential facts only
- AVOID verbose text, filler phrases, or redundant explanations
- Be CONCISE and direct - every word should add value
- NO introductory phrases like "This section describes..." or "Here we present..."
- Minimize output size for efficient processing

## AVAILABLE CONTENT FOR THIS SECTION
{contentPartsText}

## INSTRUCTIONS
1. Extract data only from provided ContentParts. Never invent or generate data.
2. If ContentParts contain no data, return empty structures (empty rows array for tables).
3. Format based on content_type ({effectiveContentType}).
4. Return only valid JSON with "elements" array.
5. No HTML/styling: Plain text only, no markup.
6. Focus on the MOST RELEVANT information for this section's topic. Extract key facts, data, and findings. Omit redundant, repetitive, or tangential content.

## OUTPUT FORMAT
Return a JSON object with this structure:

{{
  "elements": [
    {{
      "type": "{effectiveContentType}",
      "content": {contentStructureExample}
    }}
  ]
}}

Output requirements:
- "content" must be an object (never a string)
- Return only valid JSON - no text before, no text after, no comments, no explanations, no markdown code fences
- Start with {{ and end with }} - return ONLY the JSON object itself
- No invented data: Return empty structures if ContentParts have no data

## USER REQUEST
```
{userPrompt}
```

## CONTEXT
{contextText if contextText else ""}
"""
            else:
                # GENERATE MODE: Generate content from scratch based on the generationHint
                prompt = f"""# TASK: Generate Section Content

LANGUAGE: Generate all content in {language.upper()} language. All text, titles, headings, paragraphs, and content must be written in {language.upper()}.

## SECTION METADATA
- Section ID: {sectionId}
- Content Type: {effectiveContentType}
- Generation Hint: {generationHint}{formatNoteAggr}

## CONTENT EFFICIENCY PRINCIPLES
- Generate COMPACT content: Focus on essential facts only
- AVOID verbose text, filler phrases, or redundant explanations
- Be CONCISE and direct - every word should add value
- NO introductory phrases like "This section describes..." or "Here we present..."
- Minimize output size for efficient processing

## INSTRUCTIONS
1. Generate content based on the Generation Hint above.
2. Create appropriate content that matches the content_type ({effectiveContentType}).
3. The content should be relevant to the USER REQUEST and fit the context of surrounding sections.
4. Return only valid JSON with "elements" array.
5. No HTML/styling: Plain text only, no markup.
6. Keep content CONCISE - focus on substance, not length.

## OUTPUT FORMAT
Return a JSON object with this structure:

{{
  "elements": [
    {{
      "type": "{effectiveContentType}",
      "content": {contentStructureExample}
    }}
  ]
}}

Output requirements:
- "content" must be an object (never a string)
- Return only valid JSON - no text before, no text after, no comments, no explanations, no markdown code fences
- Start with {{ and end with }} - return ONLY the JSON object itself
- Generate meaningful content based on the Generation Hint

## USER REQUEST
```
{userPrompt}
```

## CONTEXT
{contextText if contextText else ""}
"""
        return prompt, templateStructure

    async def buildSectionPromptWithContinuation(
        self,
        continuationContext: Any,
        templateStructure: str,
        basePrompt: str
    ) -> str:
        """Build a section prompt with continuation context. Uses the unified signature.

        Single unified implementation for all section content generation contexts.

        Note: All initial context (section, contentParts, userPrompt, etc.) is already
        contained in basePrompt. This function only adds continuation-specific instructions.
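
        Illustrative sketch of the overlap-merge contract (payload invented): if
        the previous response was cut off as

            {"elements": [{"type": "table", "content": {"rows": [["a", "b"],

        the continuation must start exactly with the overlap context, e.g.

            [["a", "b"],

        and then continue with new rows, so both responses can be merged.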
        """
        # Extract continuation context fields (only what's needed for continuation)
        incompletePart = continuationContext.incomplete_part
        lastRawJson = continuationContext.last_raw_json

        # Generate both overlap context and hierarchy context using jsonContinuation
        overlapContext = ""
        unifiedContext = ""
        if lastRawJson:
            # Get contexts directly from jsonContinuation
            from modules.shared.jsonContinuation import getContexts
            contexts = getContexts(lastRawJson)
            overlapContext = contexts.overlapContext
            unifiedContext = contexts.hierarchyContextForPrompt
        elif incompletePart:
            unifiedContext = incompletePart
        else:
            unifiedContext = "Unable to extract context - response was completely broken"

        # Build the unified continuation prompt format
        continuationPrompt = f"""{basePrompt}

--- CONTINUATION REQUEST ---
The previous JSON response was incomplete. Continue from where it stopped.

Context showing structure hierarchy with cut point:
```
{unifiedContext}
```

Overlap Requirement:
To ensure proper merging, your response MUST start EXACTLY with the overlap context shown below, then continue with new content.

Overlap context (start your response with this exact text):
```json
{overlapContext if overlapContext else "No overlap context available"}
```

TASK:
1. Start your response EXACTLY with the overlap context shown above (character by character)
2. Continue seamlessly from where the overlap context ends
3. Complete the remaining content following the JSON structure template above
4. Return ONLY valid JSON following the structure template - no overlap/continuation wrapper objects

CRITICAL:
- Your response MUST begin with the exact overlap context text (this enables automatic merging)
- Continue seamlessly after the overlap context with new content
- Your response must be valid JSON matching the structure template above"""
        return continuationPrompt

    def _extractAndMergeMultipleJsonBlocks(self, responseText: str, contentType: str, sectionId: str) -> List[Dict[str, Any]]:
        """
        Extract multiple JSON blocks from a response and merge them appropriately.
        For tables: merge all rows into a single table.
        For other types: combine elements.
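
        Example (illustrative; the response payload is assumed to contain two
        fenced ```json blocks):

            merged = self._extractAndMergeMultipleJsonBlocks(responseText, "table", "s1")
            # -> one element list; compatible tables collapsed into a single table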
        """
        from modules.shared.jsonUtils import tryParseJson, stripCodeFences, normalizeJsonText, extractFirstBalancedJson

        # Extract all JSON blocks, handling both --- separators and multiple ```json blocks
        blocks = []

        # Strategy: Extract all ```json blocks first (most reliable), then fall back to other methods.
        # This handles cases where --- separators and ```json blocks are mixed.
        if "```json" in responseText:
            # Extract all ```json blocks regardless of --- separators
            jsonParts = responseText.split("```json")
            for jsonPart in jsonParts[1:]:  # Skip first empty part
                jsonPart = "```json" + jsonPart
                # Extract just the JSON block (until closing ```)
                closingFence = jsonPart.find("```", 7)  # Find closing ``` after "```json"
                if closingFence != -1:
                    jsonPart = jsonPart[:closingFence + 3]
                jsonPart = jsonPart.strip()
                if jsonPart:
                    blocks.append(jsonPart)

        # The fallback strategies below only run if nothing was extracted above;
        # otherwise blocks already found via ```json fences would be appended twice.
        if not blocks and "---" in responseText:
            # Split by --- and extract JSON from each part
            parts = responseText.split("---")
            for part in parts:
                part = part.strip()
                if not part:
                    continue

                # Try to extract JSON directly from this part
                normalized = normalizeJsonText(part)
                normalized = stripCodeFences(normalized)
                jsonBlock = extractFirstBalancedJson(normalized)
                if jsonBlock:
                    blocks.append(jsonBlock)
        elif not blocks and responseText.count("```json") > 1:
            # Split by ```json markers (no --- separator)
            parts = responseText.split("```json")
            for part in parts[1:]:  # Skip first empty part
                part = "```json" + part
                part = part.strip()
                if part:
                    blocks.append(part)
        elif not blocks:
            # Try to find multiple JSON objects/arrays directly
            normalized = normalizeJsonText(responseText)
            normalized = stripCodeFences(normalized)

            # Find all JSON blocks
            start = 0
            while start < len(normalized):
                # Find next JSON start
                brace = normalized.find('{', start)
                bracket = normalized.find('[', start)
                jsonStart = -1
                if brace != -1 and (bracket == -1 or brace < bracket):
                    jsonStart = brace
                elif bracket != -1:
                    jsonStart = bracket

                if jsonStart == -1:
                    break

                # Extract balanced JSON
                jsonBlock = extractFirstBalancedJson(normalized[jsonStart:])
                if jsonBlock:
                    blocks.append(jsonBlock)
                    start = jsonStart + len(jsonBlock)
                else:
                    break

        if not blocks:
            logger.warning(f"Section {sectionId}: Could not extract multiple JSON blocks")
            return []

        logger.info(f"Section {sectionId}: Extracted {len(blocks)} JSON blocks, merging for contentType={contentType}")

        # Parse all blocks
        allElements = []
        for i, block in enumerate(blocks):
            parsed, parseError, _ = tryParseJson(block)
            if parseError:
                logger.warning(f"Section {sectionId}: Failed to parse JSON block {i+1}: {str(parseError)}")
                continue

            elementsFromBlock = []
            if isinstance(parsed, dict):
                if "elements" in parsed:
                    elementsFromBlock = parsed["elements"]
                    allElements.extend(elementsFromBlock)
                elif parsed.get("type"):
                    elementsFromBlock = [parsed]
                    allElements.append(parsed)
            elif isinstance(parsed, list):
                elementsFromBlock = parsed
                allElements.extend(parsed)

            # Log row count for table elements
            if contentType == "table":
                tableCount = sum(1 for e in elementsFromBlock if isinstance(e, dict) and e.get("type") == "table")
                rowCount = sum(
                    len(e.get("content", {}).get("rows", []))
                    for e in elementsFromBlock
                    if isinstance(e, dict) and e.get("type") == "table"
                )
                if tableCount > 0:
                    logger.info(f"Section {sectionId}: JSON block {i+1}: {tableCount} table(s) with {rowCount} total rows")

        # Merge elements based on contentType
        if contentType == "table" and len(allElements) > 1:
            # Find all table elements
            tableElements = [e for e in allElements if isinstance(e, dict) and e.get("type") == "table"]
            if len(tableElements) > 1:
                # Check if tables can be merged (same column counts)
                canMerge = self._canMergeTables(tableElements)
                if canMerge:
                    logger.info(f"Section {sectionId}: Merging {len(tableElements)} tables into one")
                    mergedTable = self._mergeTableElements(tableElements)
                    # Replace all table elements with the merged one
                    nonTableElements = [e for e in allElements if not (isinstance(e, dict) and e.get("type") == "table")]
                    return [mergedTable] + nonTableElements
                else:
                    logger.warning(f"Section {sectionId}: Cannot merge {len(tableElements)} tables (incompatible headers/columns). Keeping tables separate.")
                    # Return all elements as-is (tables remain separate)
                    return allElements

        return allElements

    def _canMergeTables(self, tableElements: List[Dict[str, Any]]) -> bool:
        """Check if tables can be safely merged (same column counts)."""
        if len(tableElements) <= 1:
            return True

        # Extract column counts from all tables
        columnCounts = []
        for table in tableElements:
            headers = []
            if isinstance(table.get("content"), dict):
                headers = table["content"].get("headers", [])
            elif isinstance(table.get("content"), list):
                # Old format: content is a list of rows
                if table["content"] and isinstance(table["content"][0], list):
                    headers = table["content"][0]
            columnCounts.append(len(headers))

        # Check if all tables have the same column count
        firstCount = columnCounts[0] if columnCounts else 0
        return all(count == firstCount for count in columnCounts)

    def _mergeTableElements(self, tableElements: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Merge multiple table elements into a single table.

        Assumes tables have compatible column counts (checked by _canMergeTables).
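
        Example (illustrative values):

            t1 = {"type": "table", "content": {"headers": ["A", "B"], "rows": [["1", "2"]]}}
            t2 = {"type": "table", "content": {"headers": ["A", "B"], "rows": [["3", "4"]]}}
            self._mergeTableElements([t1, t2])
            # -> {"type": "table",
            #     "content": {"headers": ["A", "B"], "rows": [["1", "2"], ["3", "4"]]}}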
        """
        if not tableElements:
            return {"type": "table", "content": {"headers": [], "rows": []}}

        if len(tableElements) == 1:
            return tableElements[0]

        # Extract headers from all tables
        allHeaders = []
        for table in tableElements:
            headers = []
            if isinstance(table.get("content"), dict):
                headers = table["content"].get("headers", [])
            elif isinstance(table.get("content"), list):
                # Old format: content is a list of rows
                if table["content"] and isinstance(table["content"][0], list):
                    headers = table["content"][0]
            allHeaders.append(headers)

        # Check header compatibility (same headers or just same column count)
        firstHeaders = allHeaders[0]
        headersCompatible = all(headers == firstHeaders for headers in allHeaders)

        # If headers differ but column counts match, use the first table's headers and log a warning
        if not headersCompatible:
            logger.warning(f"Merging {len(tableElements)} tables with different headers but same column count. Using headers from first table.")

        # Use headers from the first table
        headers = firstHeaders

        # Collect all rows from all tables, validating column count
        allRows = []
        for tableIdx, table in enumerate(tableElements):
            rows = []
            if isinstance(table.get("content"), dict):
                rows = table["content"].get("rows", [])
            elif isinstance(table.get("content"), list):
                # Old format: content is a list of rows
                if table["content"] and isinstance(table["content"][0], list):
                    rows = table["content"][1:] if len(table["content"]) > 1 else []

            # Validate that each row's column count matches the header count
            expectedColCount = len(headers)
            validRows = []
            for rowIdx, row in enumerate(rows):
                if isinstance(row, list):
                    if len(row) == expectedColCount:
                        validRows.append(row)
                    else:
                        logger.warning(f"Table {tableIdx+1}, row {rowIdx+1}: column count mismatch ({len(row)} vs {expectedColCount}), skipping row")
                elif isinstance(row, dict):
                    # Convert dict row to list based on header order
                    rowList = [row.get(h, "") for h in headers]
                    validRows.append(rowList)
                else:
                    logger.warning(f"Table {tableIdx+1}, row {rowIdx+1}: invalid row format, skipping")

            allRows.extend(validRows)

        # Keep all rows, including duplicates (duplicates may be intentional)
        logger.info(f"Merged {len(tableElements)} tables: {len(allRows)} total rows (duplicates preserved)")

        return {
            "type": "table",
            "content": {
                "headers": headers,
                "rows": allRows
            }
        }

    def _isBinaryMimeType(self, mimeType: str) -> bool:
        """Check if a MIME type is binary."""
        binaryTypes = [
            "application/octet-stream",
            "application/pdf",
            "application/zip",
            "application/x-zip-compressed"
        ]
        return mimeType in binaryTypes

    def _looksLikeBase64(self, data: str) -> bool:
        """
        Heuristic check whether a string looks like base64-encoded data.

        Base64 contains only: A-Z, a-z, 0-9, +, /, =, and whitespace.
        If >95% of the characters are base64 chars and no normal text patterns appear, it is likely base64.
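
        Example (illustrative values):

            self._looksLikeBase64("iVBORw0KGgoAAAANSUhEUgAA" * 10)        # True
            self._looksLikeBase64("Normal prose, with punctuation. " * 5)  # False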
        """
        if not data or len(data) < 100:
            return False

        base64Chars = set("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=\n\r\t ")
        sample = data[:500]  # Check the first 500 chars
        if not sample:
            return False

        base64Ratio = sum(1 for c in sample if c in base64Chars) / len(sample)

        # If >95% base64 chars and no normal text patterns (like spaces between words) → likely base64.
        # Base64 typically has very long strings without spaces or punctuation.
        hasNormalTextPatterns = any(
            c in sample[:200] for c in ".,!?;:()[]{}\"'"
        ) or "  " in sample[:200]  # Double spaces suggest text

        return base64Ratio > 0.95 and not hasNormalTextPatterns

    def _findContentPartById(self, partId: str, contentParts: List[ContentPart]) -> Optional[ContentPart]:
        """Find a ContentPart by ID."""
        for part in contentParts:
            if part.id == partId:
                return part
        return None

    def _needsAggregation(
        self,
        contentType: str,
        contentPartCount: int
    ) -> bool:
        """
        Determines whether multiple ContentParts must be aggregated.

        Aggregation is needed when:
        - the content_type requires aggregation (table, bullet_list)
        - AND more than one ContentPart is present (> 1)

        Args:
            contentType: Section content_type
            contentPartCount: Number of ContentParts in this section

        Returns:
            True if aggregation is needed, False otherwise
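
        Example (illustrative):

            self._needsAggregation("table", 3)      # True  (aggregating type, multiple parts)
            self._needsAggregation("table", 1)      # False (single part)
            self._needsAggregation("paragraph", 3)  # False (type does not aggregate)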
        """
        aggregationTypes = ["table", "bullet_list"]

        if contentType in aggregationTypes and contentPartCount > 1:
            return True

        # Optional: paragraphs could also be aggregated when multiple parts exist
        # (e.g., comparing several documents).
        # Default: no aggregation for paragraph.
        return False

    def _getAcceptedSectionTypesForFormat(self, outputFormat: str) -> List[str]:
        """
        Get accepted section types for a given output format by querying the renderer.

        Args:
            outputFormat: Format name (e.g., 'csv', 'json', 'pdf')

        Returns:
            List of accepted section content types (e.g., ["table", "code_block"])

        Raises:
            ValueError: If the renderer is not found or does not provide accepted types
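
        Example (illustrative; the actual list depends on the registered renderer):

            self._getAcceptedSectionTypesForFormat("csv")
            # -> e.g. ["table"]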
        """
        from modules.serviceCenter.services.serviceGeneration.renderers.registry import getRenderer

        # Get the document renderer for this format (structure filling is the document generation path)
        renderer = getRenderer(outputFormat, self.services, outputStyle='document')

        if not renderer:
            raise ValueError(f"No renderer found for output format '{outputFormat}'. Check renderer registry.")

        if not hasattr(renderer, 'getAcceptedSectionTypes'):
            raise ValueError(f"Renderer for '{outputFormat}' does not implement getAcceptedSectionTypes(). Add this method to the renderer.")

        acceptedTypes = renderer.getAcceptedSectionTypes(outputFormat)

        if not acceptedTypes:
            raise ValueError(f"Renderer for '{outputFormat}' returned empty accepted types. Fix getAcceptedSectionTypes() in the renderer.")

        logger.debug(f"Renderer for '{outputFormat}' accepts: {acceptedTypes}")
        return acceptedTypes