From 02808799686c1fc455639f14b774c6fa1e98b5e2 Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Sun, 28 Dec 2025 14:08:28 +0100 Subject: [PATCH] module tests completed --- .../mainServiceExtraction.py | 2 +- .../renderers/rendererHtml.py | 30 ++++++++++++------- .../serviceGeneration/subDocumentUtility.py | 10 +++++++ 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/modules/services/serviceExtraction/mainServiceExtraction.py b/modules/services/serviceExtraction/mainServiceExtraction.py index ba4bfb69..33edb6c7 100644 --- a/modules/services/serviceExtraction/mainServiceExtraction.py +++ b/modules/services/serviceExtraction/mainServiceExtraction.py @@ -240,7 +240,7 @@ class ExtractionService: partSummary["dataPreview"] = f"[Large data: {len(part.data)} chars - truncated]" extractionSummary["parts"].append(partSummary) - writeDebugFile(json.dumps(extractionSummary, indent=2, ensure_ascii=False), f"extraction_result_{doc.fileName}") + writeDebugFile(json.dumps(extractionSummary, indent=2, ensure_ascii=False), f"extraction_result_{doc.fileName}.txt") except Exception as e: logger.debug(f"Failed to write extraction debug file: {str(e)}") diff --git a/modules/services/serviceGeneration/renderers/rendererHtml.py b/modules/services/serviceGeneration/renderers/rendererHtml.py index 275302b6..17ac25b3 100644 --- a/modules/services/serviceGeneration/renderers/rendererHtml.py +++ b/modules/services/serviceGeneration/renderers/rendererHtml.py @@ -700,12 +700,19 @@ class RendererHtml(BaseRenderer): import base64 import re - # Find all image data URIs in HTML (verschiedene MIME-Types unterstützen) - # Pattern: data:image/[type];base64, - dataUriPattern = r'data:image/[^;]+;base64,([A-Za-z0-9+/=]+)' + # Find entire img tags with data URIs and replace them + # Pattern: + imgTagPattern = r']*>' - def replaceDataUri(match): - base64Data = match.group(1) + def replaceImgTag(match): + imgTag = match.group(0) + + # Extract base64 data from the img tag + base64Match = re.search(r'data:image/[^;]+;base64,([A-Za-z0-9+/=]+)', imgTag) + if not base64Match: + return imgTag # Return original if no base64 found + + base64Data = base64Match.group(1) # Find matching image in images list matchingImage = None @@ -721,12 +728,13 @@ class RendererHtml(BaseRenderer): # Use filename from image data (generated from section ID) filename = matchingImage.get("filename", f"image_{images.index(matchingImage) + 1}.png") - # Replace with relative path (ohne Pfad, nur Dateiname) - # Escape HTML in altText and caption to prevent injection - altText = html.escape(str(matchingImage.get("altText", "Image"))) + # Extract existing alt text or use from matchingImage + altMatch = re.search(r'alt="([^"]*)"', imgTag) + existingAlt = altMatch.group(1) if altMatch else "" + altText = html.escape(str(matchingImage.get("altText", existingAlt or "Image"))) caption = html.escape(str(matchingImage.get("caption", ""))) if matchingImage.get("caption") else "" - # Entferne IMAGE_MARKER Kommentar falls vorhanden + # Create new img tag with filename imgTag = f'{altText}' if caption: @@ -737,8 +745,8 @@ class RendererHtml(BaseRenderer): # Keep original if no match found return match.group(0) - # Replace all data URIs (auch IMAGE_MARKER Kommentare entfernen) - updatedHtml = re.sub(dataUriPattern, replaceDataUri, htmlContent) + # Replace all img tags with data URIs (auch IMAGE_MARKER Kommentare entfernen) + updatedHtml = re.sub(imgTagPattern, replaceImgTag, htmlContent) # Entferne IMAGE_MARKER Kommentare die übrig geblieben sind updatedHtml = re.sub(r'', '', updatedHtml) diff --git a/modules/services/serviceGeneration/subDocumentUtility.py b/modules/services/serviceGeneration/subDocumentUtility.py index abef95da..329f09f6 100644 --- a/modules/services/serviceGeneration/subDocumentUtility.py +++ b/modules/services/serviceGeneration/subDocumentUtility.py @@ -180,6 +180,16 @@ def convertDocumentDataToString(document_data: Any, file_extension: str) -> str: try: if document_data is None: return "" + if isinstance(document_data, bytes): + # WICHTIG: Decode bytes to string for text files (HTML, text, etc.) + try: + return document_data.decode('utf-8') + except UnicodeDecodeError: + # Fallback: try latin1 or return with error replacement + try: + return document_data.decode('latin1') + except Exception: + return document_data.decode('utf-8', errors='replace') if isinstance(document_data, str): return document_data if isinstance(document_data, dict):