From 02808799686c1fc455639f14b774c6fa1e98b5e2 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Sun, 28 Dec 2025 14:08:28 +0100
Subject: [PATCH] module tests completed
---
.../mainServiceExtraction.py | 2 +-
.../renderers/rendererHtml.py | 30 ++++++++++++-------
.../serviceGeneration/subDocumentUtility.py | 10 +++++++
3 files changed, 30 insertions(+), 12 deletions(-)
diff --git a/modules/services/serviceExtraction/mainServiceExtraction.py b/modules/services/serviceExtraction/mainServiceExtraction.py
index ba4bfb69..33edb6c7 100644
--- a/modules/services/serviceExtraction/mainServiceExtraction.py
+++ b/modules/services/serviceExtraction/mainServiceExtraction.py
@@ -240,7 +240,7 @@ class ExtractionService:
partSummary["dataPreview"] = f"[Large data: {len(part.data)} chars - truncated]"
extractionSummary["parts"].append(partSummary)
- writeDebugFile(json.dumps(extractionSummary, indent=2, ensure_ascii=False), f"extraction_result_{doc.fileName}")
+ writeDebugFile(json.dumps(extractionSummary, indent=2, ensure_ascii=False), f"extraction_result_{doc.fileName}.txt")
except Exception as e:
logger.debug(f"Failed to write extraction debug file: {str(e)}")
diff --git a/modules/services/serviceGeneration/renderers/rendererHtml.py b/modules/services/serviceGeneration/renderers/rendererHtml.py
index 275302b6..17ac25b3 100644
--- a/modules/services/serviceGeneration/renderers/rendererHtml.py
+++ b/modules/services/serviceGeneration/renderers/rendererHtml.py
@@ -700,12 +700,19 @@ class RendererHtml(BaseRenderer):
import base64
import re
- # Find all image data URIs in HTML (verschiedene MIME-Types unterstützen)
- # Pattern: data:image/[type];base64,
- dataUriPattern = r'data:image/[^;]+;base64,([A-Za-z0-9+/=]+)'
+ # Find entire img tags with data URIs and replace them
+ # Pattern: an <img ...> tag that contains a data:image/...;base64,... URI
+ imgTagPattern = r'
]*>'
- def replaceDataUri(match):
- base64Data = match.group(1)
+ def replaceImgTag(match):
+ imgTag = match.group(0)
+
+ # Extract base64 data from the img tag
+ base64Match = re.search(r'data:image/[^;]+;base64,([A-Za-z0-9+/=]+)', imgTag)
+ if not base64Match:
+ return imgTag # Return original if no base64 found
+
+ base64Data = base64Match.group(1)
# Find matching image in images list
matchingImage = None
@@ -721,12 +728,13 @@ class RendererHtml(BaseRenderer):
# Use filename from image data (generated from section ID)
filename = matchingImage.get("filename", f"image_{images.index(matchingImage) + 1}.png")
- # Replace with relative path (ohne Pfad, nur Dateiname)
- # Escape HTML in altText and caption to prevent injection
- altText = html.escape(str(matchingImage.get("altText", "Image")))
+ # Extract existing alt text or use from matchingImage
+ altMatch = re.search(r'alt="([^"]*)"', imgTag)
+ existingAlt = altMatch.group(1) if altMatch else ""
+ altText = html.escape(str(matchingImage.get("altText", existingAlt or "Image")))
caption = html.escape(str(matchingImage.get("caption", ""))) if matchingImage.get("caption") else ""
- # Entferne IMAGE_MARKER Kommentar falls vorhanden
+ # Create new img tag with filename
imgTag = f'
'
if caption:
@@ -737,8 +745,8 @@ class RendererHtml(BaseRenderer):
# Keep original if no match found
return match.group(0)
- # Replace all data URIs (auch IMAGE_MARKER Kommentare entfernen)
- updatedHtml = re.sub(dataUriPattern, replaceDataUri, htmlContent)
+ # Replace all img tags with data URIs (leftover IMAGE_MARKER comments are removed in the next step)
+ updatedHtml = re.sub(imgTagPattern, replaceImgTag, htmlContent)
# Entferne IMAGE_MARKER Kommentare die übrig geblieben sind
updatedHtml = re.sub(r'', '', updatedHtml)
diff --git a/modules/services/serviceGeneration/subDocumentUtility.py b/modules/services/serviceGeneration/subDocumentUtility.py
index abef95da..329f09f6 100644
--- a/modules/services/serviceGeneration/subDocumentUtility.py
+++ b/modules/services/serviceGeneration/subDocumentUtility.py
@@ -180,6 +180,16 @@ def convertDocumentDataToString(document_data: Any, file_extension: str) -> str:
try:
if document_data is None:
return ""
+ if isinstance(document_data, bytes):
+ # IMPORTANT: decode bytes to a string for text files (HTML, plain text, etc.)
+ try:
+ return document_data.decode('utf-8')
+ except UnicodeDecodeError:
+ # Fallback: decode as latin1 (maps every byte value, so it always succeeds); the errors='replace' branch below is only a defensive last resort
+ try:
+ return document_data.decode('latin1')
+ except Exception:
+ return document_data.decode('utf-8', errors='replace')
if isinstance(document_data, str):
return document_data
if isinstance(document_data, dict):