Fixed generation issue; AI calls are now made only for extracted content

This commit is contained in:
ValueOn AG 2025-12-28 13:51:19 +01:00
parent 723f98ea7a
commit db456f1667
4 changed files with 138 additions and 95 deletions

View file

@ -134,23 +134,13 @@ class StructureFiller:
userPrompt=userPrompt userPrompt=userPrompt
) )
# Debug: Log Prompt # AI-Call für Chapter-Struktur-Generierung
self.services.utils.writeDebugFile( # Note: Debug logging is handled by callAiPlanning
chapterPrompt,
f"chapter_structure_{chapterId}_prompt"
)
aiResponse = await self.aiService.callAiPlanning( aiResponse = await self.aiService.callAiPlanning(
prompt=chapterPrompt, prompt=chapterPrompt,
debugType=f"chapter_structure_{chapterId}" debugType=f"chapter_structure_{chapterId}"
) )
# Debug: Log Response
self.services.utils.writeDebugFile(
aiResponse,
f"chapter_structure_{chapterId}_response"
)
sectionsStructure = json.loads( sectionsStructure = json.loads(
self.services.utils.jsonExtractString(aiResponse) self.services.utils.jsonExtractString(aiResponse)
) )
@ -158,20 +148,39 @@ class StructureFiller:
chapter["sections"] = sectionsStructure.get("sections", []) chapter["sections"] = sectionsStructure.get("sections", [])
# Setze useAiCall Flag (falls nicht von AI gesetzt) # Setze useAiCall Flag (falls nicht von AI gesetzt)
# WICHTIG: useAiCall kann nur true sein, wenn mindestens ein ContentPart Format "extracted" hat!
# "object" und "reference" Formate werden direkt als Elemente hinzugefügt, benötigen kein AI.
for section in chapter["sections"]: for section in chapter["sections"]:
if "useAiCall" not in section: if "useAiCall" not in section:
contentType = section.get("content_type", "paragraph") contentType = section.get("content_type", "paragraph")
useAiCall = contentType != "paragraph" contentPartIds = section.get("contentPartIds", [])
# Prüfe contentPartInstructions # Prüfe ob mindestens ein ContentPart Format "extracted" hat
if not useAiCall: hasExtractedPart = False
for partId in section.get("contentPartIds", []): for partId in contentPartIds:
instruction = contentPartInstructions.get(partId, {}).get("instruction", "") part = self._findContentPartById(partId, contentParts)
if instruction and instruction.lower() not in ["include full text", "include all content", "use full extracted text"]: if part:
useAiCall = True contentFormat = part.metadata.get("contentFormat", "unknown")
if contentFormat == "extracted":
hasExtractedPart = True
break break
# useAiCall kann nur true sein, wenn extracted Parts vorhanden sind
useAiCall = False
if hasExtractedPart:
# Prüfe ob Transformation nötig ist
useAiCall = contentType != "paragraph"
# Prüfe contentPartInstructions für Transformation
if not useAiCall:
for partId in contentPartIds:
instruction = contentPartInstructions.get(partId, {}).get("instruction", "")
if instruction and instruction.lower() not in ["include full text", "include all content", "use full extracted text"]:
useAiCall = True
break
section["useAiCall"] = useAiCall section["useAiCall"] = useAiCall
logger.debug(f"Section {section.get('id')}: useAiCall={useAiCall} (hasExtractedPart={hasExtractedPart}, contentType={contentType})")
return chapterStructure return chapterStructure
@ -200,10 +209,16 @@ class StructureFiller:
sectionId = section.get("id") sectionId = section.get("id")
contentPartIds = section.get("contentPartIds", []) contentPartIds = section.get("contentPartIds", [])
contentFormats = section.get("contentFormats", {}) contentFormats = section.get("contentFormats", {})
generationHint = section.get("generation_hint") # Check both camelCase and snake_case for generationHint
generationHint = section.get("generationHint") or section.get("generation_hint")
contentType = section.get("content_type", "paragraph") contentType = section.get("content_type", "paragraph")
useAiCall = section.get("useAiCall", False) useAiCall = section.get("useAiCall", False)
# WICHTIG: Wenn keine ContentParts vorhanden sind, kann kein AI-Call gemacht werden
if len(contentPartIds) == 0:
useAiCall = False
logger.debug(f"Section {sectionId}: No content parts, setting useAiCall=False")
elements = [] elements = []
# Prüfe ob Aggregation nötig ist # Prüfe ob Aggregation nötig ist
@ -212,6 +227,8 @@ class StructureFiller:
contentPartCount=len(contentPartIds) contentPartCount=len(contentPartIds)
) )
logger.info(f"Processing section {sectionId}: contentType={contentType}, contentPartCount={len(contentPartIds)}, useAiCall={useAiCall}, needsAggregation={needsAggregation}, hasGenerationHint={bool(generationHint)}")
if needsAggregation and useAiCall: if needsAggregation and useAiCall:
# Aggregation: Alle Parts zusammen verarbeiten # Aggregation: Alle Parts zusammen verarbeiten
sectionParts = [ sectionParts = [
@ -251,6 +268,7 @@ class StructureFiller:
# Aggregiere extracted Parts mit AI # Aggregiere extracted Parts mit AI
if extractedParts: if extractedParts:
logger.debug(f"Section {sectionId}: Aggregating {len(extractedParts)} extracted parts with AI")
generationPrompt = self._buildSectionGenerationPrompt( generationPrompt = self._buildSectionGenerationPrompt(
section=section, section=section,
contentParts=extractedParts, # ALLE PARTS für Aggregation! contentParts=extractedParts, # ALLE PARTS für Aggregation!
@ -279,6 +297,7 @@ class StructureFiller:
generationPrompt, generationPrompt,
f"section_content_{sectionId}_prompt" f"section_content_{sectionId}_prompt"
) )
logger.debug(f"Logged section prompt: section_content_{sectionId}_prompt (aggregation)")
# Verwende callAi für ContentParts-Unterstützung (nicht callAiPlanning!) # Verwende callAi für ContentParts-Unterstützung (nicht callAiPlanning!)
request = AiCallRequest( request = AiCallRequest(
@ -297,6 +316,7 @@ class StructureFiller:
aiResponse.content, aiResponse.content,
f"section_content_{sectionId}_response" f"section_content_{sectionId}_response"
) )
logger.debug(f"Logged section response: section_content_{sectionId}_response (aggregation)")
# Parse und füge zu elements hinzu # Parse und füge zu elements hinzu
generatedElements = json.loads( generatedElements = json.loads(
@ -348,8 +368,10 @@ class StructureFiller:
}) })
elif contentFormat == "extracted": elif contentFormat == "extracted":
if generationHint: # WICHTIG: Prüfe sowohl useAiCall als auch generationHint
if useAiCall and generationHint:
# AI-Call mit einzelnen ContentPart # AI-Call mit einzelnen ContentPart
logger.debug(f"Processing section {sectionId}: Single extracted part with AI call (useAiCall={useAiCall}, generationHint={bool(generationHint)})")
generationPrompt = self._buildSectionGenerationPrompt( generationPrompt = self._buildSectionGenerationPrompt(
section=section, section=section,
contentParts=[part], # EIN PART contentParts=[part], # EIN PART
@ -378,6 +400,7 @@ class StructureFiller:
generationPrompt, generationPrompt,
f"section_content_{sectionId}_prompt" f"section_content_{sectionId}_prompt"
) )
logger.debug(f"Logged section prompt: section_content_{sectionId}_prompt")
# Verwende callAi für ContentParts-Unterstützung # Verwende callAi für ContentParts-Unterstützung
request = AiCallRequest( request = AiCallRequest(
@ -396,6 +419,7 @@ class StructureFiller:
aiResponse.content, aiResponse.content,
f"section_content_{sectionId}_response" f"section_content_{sectionId}_response"
) )
logger.debug(f"Logged section response: section_content_{sectionId}_response")
# Parse und füge zu elements hinzu # Parse und füge zu elements hinzu
generatedElements = json.loads( generatedElements = json.loads(
@ -421,6 +445,7 @@ class StructureFiller:
# NICHT raise - Section wird mit Fehlermeldung gerendert # NICHT raise - Section wird mit Fehlermeldung gerendert
else: else:
# Füge extrahierten Text direkt hinzu (kein AI-Call) # Füge extrahierten Text direkt hinzu (kein AI-Call)
logger.debug(f"Processing section {sectionId}: Single extracted part WITHOUT AI call (useAiCall={useAiCall}, generationHint={bool(generationHint)}) - adding extracted text directly")
elements.append({ elements.append({
"type": "extracted_text", "type": "extracted_text",
"content": part.data, "content": part.data,
@ -566,8 +591,15 @@ class StructureFiller:
sectionId = section.get("id") sectionId = section.get("id")
contentPartIds = section.get("contentPartIds", []) contentPartIds = section.get("contentPartIds", [])
contentFormats = section.get("contentFormats", {}) contentFormats = section.get("contentFormats", {})
generationHint = section.get("generation_hint") # Check both camelCase and snake_case for generationHint
generationHint = section.get("generationHint") or section.get("generation_hint")
contentType = section.get("content_type", "paragraph") contentType = section.get("content_type", "paragraph")
useAiCall = section.get("useAiCall", False)
# WICHTIG: Wenn keine ContentParts vorhanden sind, kann kein AI-Call gemacht werden
if len(contentPartIds) == 0:
useAiCall = False
logger.debug(f"Section {sectionId} (legacy): No content parts, setting useAiCall=False")
elements = [] elements = []
@ -577,7 +609,9 @@ class StructureFiller:
contentPartCount=len(contentPartIds) contentPartCount=len(contentPartIds)
) )
if needsAggregation and generationHint: logger.info(f"Processing section {sectionId} (legacy): contentType={contentType}, contentPartCount={len(contentPartIds)}, useAiCall={useAiCall}, needsAggregation={needsAggregation}, hasGenerationHint={bool(generationHint)}")
if needsAggregation and useAiCall and generationHint:
# Aggregation: Alle Parts zusammen verarbeiten # Aggregation: Alle Parts zusammen verarbeiten
sectionParts = [ sectionParts = [
self._findContentPartById(pid, contentParts) self._findContentPartById(pid, contentParts)
@ -702,8 +736,10 @@ class StructureFiller:
}) })
elif contentFormat == "extracted": elif contentFormat == "extracted":
if generationHint: # WICHTIG: Prüfe sowohl useAiCall als auch generationHint
if useAiCall and generationHint:
# AI-Call mit einzelnen ContentPart # AI-Call mit einzelnen ContentPart
logger.debug(f"Processing section {sectionId}: Single extracted part with AI call (useAiCall={useAiCall}, generationHint={bool(generationHint)})")
generationPrompt = self._buildSectionGenerationPrompt( generationPrompt = self._buildSectionGenerationPrompt(
section=section, section=section,
contentParts=[part], contentParts=[part],
@ -729,6 +765,7 @@ class StructureFiller:
generationPrompt, generationPrompt,
f"section_content_{sectionId}_prompt" f"section_content_{sectionId}_prompt"
) )
logger.debug(f"Logged section prompt: section_content_{sectionId}_prompt")
request = AiCallRequest( request = AiCallRequest(
prompt=generationPrompt, prompt=generationPrompt,
@ -745,6 +782,7 @@ class StructureFiller:
aiResponse.content, aiResponse.content,
f"section_content_{sectionId}_response" f"section_content_{sectionId}_response"
) )
logger.debug(f"Logged section response: section_content_{sectionId}_response")
generatedElements = json.loads( generatedElements = json.loads(
self.services.utils.jsonExtractString(aiResponse.content) self.services.utils.jsonExtractString(aiResponse.content)
@ -765,6 +803,8 @@ class StructureFiller:
}) })
logger.error(f"Error generating section {sectionId}: {str(e)}") logger.error(f"Error generating section {sectionId}: {str(e)}")
else: else:
# Füge extrahierten Text direkt hinzu (kein AI-Call)
logger.debug(f"Processing section {sectionId}: Single extracted part WITHOUT AI call (useAiCall={useAiCall}, generationHint={bool(generationHint)}) - adding extracted text directly")
elements.append({ elements.append({
"type": "extracted_text", "type": "extracted_text",
"content": part.data, "content": part.data,
@ -817,35 +857,44 @@ class StructureFiller:
prompt = f"""TASK: Generate Chapter Sections Structure prompt = f"""TASK: Generate Chapter Sections Structure
CHAPTER METADATA: CHAPTER: {chapterTitle} (Level {chapterLevel}, ID: {chapterId})
- Chapter ID: {chapterId} GENERATION HINT: {generationHint}
- Chapter Level: {chapterLevel}
- Chapter Title: {chapterTitle}
- Generation Hint: {generationHint}
WICHTIG: Chapter hat bereits vordefinierte Heading-Section. NOTE: Chapter already has a heading section. Do NOT generate a heading for the chapter title.
Generiere NICHT eine Heading-Section für Chapter-Title!
AVAILABLE CONTENT PARTS: AVAILABLE CONTENT PARTS:
{contentPartsIndex} {contentPartsIndex}
STANDARD JSON SCHEMA FOR SECTIONS: CONTENT TYPES: table, bullet_list, heading, paragraph, code_block, image
Supported content_types: table, bullet_list, heading, paragraph, code_block, image
Return JSON: useAiCall RULES:
- useAiCall: true ONLY if ContentPart Format is "extracted" AND transformation needed
- useAiCall: false if Format is "object" or "reference" (direct insertion)
- useAiCall: false if Format is "extracted" AND simple "include full text" instruction
RETURN JSON:
{{ {{
"sections": [ "sections": [
{{ {{
"id": "section_1", "id": "section_1",
"content_type": "paragraph", "content_type": "paragraph",
"contentPartIds": ["part_ext_1"], "contentPartIds": ["extracted_part_1"],
"generationHint": "...", "generationHint": "Include full text",
"useAiCall": false, "useAiCall": false,
"elements": [] "elements": []
}} }}
] ]
}} }}
EXAMPLES (all content types):
- paragraph: {{"id": "s1", "content_type": "paragraph", "contentPartIds": ["extracted_1"], "generationHint": "Include full text", "useAiCall": false, "elements": []}}
- bullet_list: {{"id": "s2", "content_type": "bullet_list", "contentPartIds": ["extracted_1"], "generationHint": "Create bullet list", "useAiCall": true, "elements": []}}
- table: {{"id": "s3", "content_type": "table", "contentPartIds": ["extracted_1", "extracted_2"], "generationHint": "Create table", "useAiCall": true, "elements": []}}
- heading: {{"id": "s4", "content_type": "heading", "contentPartIds": ["extracted_1"], "generationHint": "Extract heading", "useAiCall": true, "elements": []}}
- code_block: {{"id": "s5", "content_type": "code_block", "contentPartIds": ["extracted_1"], "generationHint": "Format code", "useAiCall": true, "elements": []}}
- image: {{"id": "s6", "content_type": "image", "contentPartIds": ["obj_1"], "generationHint": "Display image", "useAiCall": false, "elements": []}}
- reference: {{"id": "s7", "content_type": "paragraph", "contentPartIds": ["ref_1"], "generationHint": "Reference", "useAiCall": false, "elements": []}}
CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside the JSON. CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside the JSON.
""" """
return prompt return prompt

View file

@ -68,24 +68,13 @@ class StructureGenerator:
outputFormat=outputFormat outputFormat=outputFormat
) )
# Debug: Log Prompt
self.services.utils.writeDebugFile(
structurePrompt,
"chapter_structure_generation_prompt"
)
# AI-Call für Chapter-Struktur-Generierung # AI-Call für Chapter-Struktur-Generierung
# Note: Debug logging is handled by callAiPlanning
aiResponse = await self.aiService.callAiPlanning( aiResponse = await self.aiService.callAiPlanning(
prompt=structurePrompt, prompt=structurePrompt,
debugType="chapter_structure_generation" debugType="chapter_structure_generation"
) )
# Debug: Log Response
self.services.utils.writeDebugFile(
aiResponse,
"chapter_structure_generation_response"
)
# Parse Struktur # Parse Struktur
structure = json.loads(self.services.utils.jsonExtractString(aiResponse)) structure = json.loads(self.services.utils.jsonExtractString(aiResponse))
@ -143,34 +132,6 @@ class StructureGenerator:
# Baue Index nur für gültige Parts # Baue Index nur für gültige Parts
for i, part in enumerate(validParts, 1): for i, part in enumerate(validParts, 1):
contentFormat = part.metadata.get("contentFormat", "unknown") contentFormat = part.metadata.get("contentFormat", "unknown")
dataPreview = ""
if contentFormat == "extracted":
# Für Image-Parts: Zeige dass es ein Image ist
if part.typeGroup == "image":
dataLength = len(part.data) if part.data else 0
mimeType = part.mimeType or "image"
dataPreview = f"Image data ({mimeType}, {dataLength} chars) - base64 encoded image content"
elif part.typeGroup == "container":
# Container ohne Daten überspringen wir bereits oben
dataPreview = "Container structure (no text content)"
else:
# Zeige Preview von extrahiertem Text
if part.data:
preview = part.data[:200] + "..." if len(part.data) > 200 else part.data
dataPreview = preview
else:
dataPreview = "(empty)"
elif contentFormat == "object":
dataLength = len(part.data) if part.data else 0
mimeType = part.mimeType or "binary"
if part.typeGroup == "image":
dataPreview = f"Base64 encoded image ({mimeType}, {dataLength} chars)"
else:
dataPreview = f"Base64 encoded binary ({mimeType}, {dataLength} chars)"
elif contentFormat == "reference":
dataPreview = part.metadata.get("documentReference", "reference")
originalFileName = part.metadata.get('originalFileName', 'N/A') originalFileName = part.metadata.get('originalFileName', 'N/A')
contentPartsIndex += f"\n{i}. ContentPart ID: {part.id}\n" contentPartsIndex += f"\n{i}. ContentPart ID: {part.id}\n"
@ -180,7 +141,6 @@ class StructureGenerator:
contentPartsIndex += f" Source: {part.metadata.get('documentId', 'unknown')}\n" contentPartsIndex += f" Source: {part.metadata.get('documentId', 'unknown')}\n"
contentPartsIndex += f" Original file name: {originalFileName}\n" contentPartsIndex += f" Original file name: {originalFileName}\n"
contentPartsIndex += f" Usage hint: {part.metadata.get('usageHint', 'N/A')}\n" contentPartsIndex += f" Usage hint: {part.metadata.get('usageHint', 'N/A')}\n"
contentPartsIndex += f" Data preview: {dataPreview}\n"
if not contentPartsIndex: if not contentPartsIndex:
contentPartsIndex = "\n(No content parts available)" contentPartsIndex = "\n(No content parts available)"

View file

@ -5,7 +5,7 @@ Base renderer class for all format renderers.
""" """
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Dict, Any, List from typing import Dict, Any, List, Tuple
from modules.datamodels.datamodelJson import supportedSectionTypes from modules.datamodels.datamodelJson import supportedSectionTypes
from modules.datamodels.datamodelDocument import RenderedDocument from modules.datamodels.datamodelDocument import RenderedDocument
import json import json
@ -201,9 +201,15 @@ class BaseRenderer(ABC):
def _extractTableData(self, sectionData: Dict[str, Any]) -> Tuple[List[str], List[List[str]]]: def _extractTableData(self, sectionData: Dict[str, Any]) -> Tuple[List[str], List[List[str]]]:
"""Extract table headers and rows from section data.""" """Extract table headers and rows from section data."""
# Normalize when elements array was passed in # Normalize when elements array was passed in
if isinstance(sectionData, list) and sectionData: if isinstance(sectionData, list):
candidate = sectionData[0] if sectionData and isinstance(sectionData[0], dict):
sectionData = candidate if isinstance(candidate, dict) else {} sectionData = sectionData[0]
else:
# Empty list or invalid structure - return empty table
return [], []
# Ensure sectionData is a dict before calling .get()
if not isinstance(sectionData, dict):
return [], []
headers = sectionData.get("headers", []) headers = sectionData.get("headers", [])
rows = sectionData.get("rows", []) rows = sectionData.get("rows", [])
return headers, rows return headers, rows
@ -227,8 +233,15 @@ class BaseRenderer(ABC):
def _extractHeadingData(self, sectionData: Dict[str, Any]) -> Tuple[int, str]: def _extractHeadingData(self, sectionData: Dict[str, Any]) -> Tuple[int, str]:
"""Extract heading level and text from section data.""" """Extract heading level and text from section data."""
# Normalize when elements array was passed in # Normalize when elements array was passed in
if isinstance(sectionData, list) and sectionData: if isinstance(sectionData, list):
sectionData = sectionData[0] if isinstance(sectionData[0], dict) else {} if sectionData and isinstance(sectionData[0], dict):
sectionData = sectionData[0]
else:
# Empty list or invalid structure - return default
return 1, ""
# Ensure sectionData is a dict before calling .get()
if not isinstance(sectionData, dict):
return 1, ""
level = sectionData.get("level", 1) level = sectionData.get("level", 1)
text = sectionData.get("text", "") text = sectionData.get("text", "")
return level, text return level, text
@ -249,8 +262,15 @@ class BaseRenderer(ABC):
def _extractCodeBlockData(self, sectionData: Dict[str, Any]) -> Tuple[str, str]: def _extractCodeBlockData(self, sectionData: Dict[str, Any]) -> Tuple[str, str]:
"""Extract code and language from section data.""" """Extract code and language from section data."""
# Normalize when elements array was passed in # Normalize when elements array was passed in
if isinstance(sectionData, list) and sectionData: if isinstance(sectionData, list):
sectionData = sectionData[0] if isinstance(sectionData[0], dict) else {} if sectionData and isinstance(sectionData[0], dict):
sectionData = sectionData[0]
else:
# Empty list or invalid structure - return default
return "", ""
# Ensure sectionData is a dict before calling .get()
if not isinstance(sectionData, dict):
return "", ""
code = sectionData.get("code", "") code = sectionData.get("code", "")
language = sectionData.get("language", "") language = sectionData.get("language", "")
return code, language return code, language
@ -258,8 +278,15 @@ class BaseRenderer(ABC):
def _extractImageData(self, sectionData: Dict[str, Any]) -> Tuple[str, str]: def _extractImageData(self, sectionData: Dict[str, Any]) -> Tuple[str, str]:
"""Extract base64 data and alt text from section data.""" """Extract base64 data and alt text from section data."""
# Normalize when elements array was passed in # Normalize when elements array was passed in
if isinstance(sectionData, list) and sectionData: if isinstance(sectionData, list):
sectionData = sectionData[0] if isinstance(sectionData[0], dict) else {} if sectionData and isinstance(sectionData[0], dict):
sectionData = sectionData[0]
else:
# Empty list or invalid structure - return default
return "", "Image"
# Ensure sectionData is a dict before calling .get()
if not isinstance(sectionData, dict):
return "", "Image"
base64Data = sectionData.get("base64Data", "") base64Data = sectionData.get("base64Data", "")
altText = sectionData.get("altText", "Image") altText = sectionData.get("altText", "Image")
return base64Data, altText return base64Data, altText

View file

@ -396,7 +396,7 @@ class RendererHtml(BaseRenderer):
source = element.get("source", "") source = element.get("source", "")
if content: if content:
source_text = f' <small><em>(Source: {source})</em></small>' if source else '' source_text = f' <small><em>(Source: {source})</em></small>' if source else ''
htmlParts.append(f'<p class="extracted-text">{content}{source_text}</p>') htmlParts.append(f'<p>{content}{source_text}</p>')
elif isinstance(element, dict): elif isinstance(element, dict):
# Regular paragraph element # Regular paragraph element
text = element.get("text", element.get("content", "")) text = element.get("text", element.get("content", ""))
@ -432,7 +432,7 @@ class RendererHtml(BaseRenderer):
source = element.get("source", "") source = element.get("source", "")
if content: if content:
source_text = f' <small><em>(Source: {source})</em></small>' if source else '' source_text = f' <small><em>(Source: {source})</em></small>' if source else ''
htmlParts.append(f'<p class="extracted-text">{content}{source_text}</p>') htmlParts.append(f'<p>{content}{source_text}</p>')
if htmlParts: if htmlParts:
return '\n'.join(htmlParts) return '\n'.join(htmlParts)
@ -577,18 +577,23 @@ class RendererHtml(BaseRenderer):
def _renderJsonImage(self, imageData: Dict[str, Any], styles: Dict[str, Any]) -> str: def _renderJsonImage(self, imageData: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a JSON image to HTML with placeholder for later replacement.""" """Render a JSON image to HTML with placeholder for later replacement."""
try: try:
import html
base64Data = imageData.get("base64Data", "") base64Data = imageData.get("base64Data", "")
altText = imageData.get("altText", "Image") altText = imageData.get("altText", "Image")
caption = imageData.get("caption", "") caption = imageData.get("caption", "")
# Escape HTML in altText and caption to prevent injection
altTextEscaped = html.escape(str(altText))
captionEscaped = html.escape(str(caption)) if caption else ""
if base64Data: if base64Data:
# Use data URI as placeholder - will be replaced with file path in _replaceImageDataUris # Use data URI as placeholder - will be replaced with file path in _replaceImageDataUris
# Include a marker so we can find and replace it # Include a marker so we can find and replace it
imageMarker = f"<!--IMAGE_MARKER:{len(base64Data)}:{altText[:50]}-->" imageMarker = f"<!--IMAGE_MARKER:{len(base64Data)}:{altTextEscaped[:50]}-->"
imgTag = f'<img src="data:image/png;base64,{base64Data}" alt="{altText}">' imgTag = f'<img src="data:image/png;base64,{base64Data}" alt="{altTextEscaped}">'
if caption: if captionEscaped:
return f'{imageMarker}<figure>{imgTag}<figcaption>{caption}</figcaption></figure>' return f'{imageMarker}<figure>{imgTag}<figcaption>{captionEscaped}</figcaption></figure>'
else: else:
return f'{imageMarker}{imgTag}' return f'{imageMarker}{imgTag}'
@ -712,12 +717,14 @@ class RendererHtml(BaseRenderer):
break break
if matchingImage: if matchingImage:
import html
# Use filename from image data (generated from section ID) # Use filename from image data (generated from section ID)
filename = matchingImage.get("filename", f"image_{images.index(matchingImage) + 1}.png") filename = matchingImage.get("filename", f"image_{images.index(matchingImage) + 1}.png")
# Replace with relative path (ohne Pfad, nur Dateiname) # Replace with relative path (ohne Pfad, nur Dateiname)
altText = matchingImage.get("altText", "Image") # Escape HTML in altText and caption to prevent injection
caption = matchingImage.get("caption", "") altText = html.escape(str(matchingImage.get("altText", "Image")))
caption = html.escape(str(matchingImage.get("caption", ""))) if matchingImage.get("caption") else ""
# Entferne IMAGE_MARKER Kommentar falls vorhanden # Entferne IMAGE_MARKER Kommentar falls vorhanden
imgTag = f'<img src="{filename}" alt="{altText}">' imgTag = f'<img src="{filename}" alt="{altText}">'